Skip to main content

hematite/agent/
inference.rs

1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
7// ── Engine ────────────────────────────────────────────────────────────────────
8
/// Shared state for talking to the LLM provider: the HTTP client, the endpoint
/// URLs, the detected runtime profile (model id and context length) and a few
/// session-wide flags.
pub struct InferenceEngine {
    /// HTTP client used for provider requests (`new` builds it with a 180 s timeout).
    pub client: reqwest::Client,
    /// Chat-completions endpoint; `new` makes sure it ends in `/chat/completions`.
    pub api_url: String,
    /// Root URL of the LLM provider (e.g. `http://localhost:1234`).
    /// All non-completions endpoints (models list, health, embeddings) are derived from this.
    pub base_url: String,
    /// Value passed straight through from `new`.
    /// NOTE(review): not interpreted in this part of the file — presumably persona data; confirm.
    pub species: String,
    /// Value passed straight through from `new`.
    /// NOTE(review): not interpreted in this part of the file — presumably a persona tuning level; confirm.
    pub snark: u8,
    /// Semaphore created with 3 permits in `new`.
    /// NOTE(review): acquirers are outside this view — presumably bounds concurrent requests; confirm.
    pub kv_semaphore: Semaphore,
    /// The model ID currently loaded in LM Studio (auto-detected on boot).
    pub model: std::sync::RwLock<String>,
    /// Context window length in tokens (auto-detected from LM Studio, default 32768).
    pub context_length: std::sync::atomic::AtomicUsize,
    /// Session economics record, shared behind an `Arc<Mutex<..>>`.
    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
    /// Optional model ID for worker-level tasks (Swarms / research).
    pub worker_model: Option<String>,
    /// Opt-in Gemma-native request shaping. Off by default.
    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
    /// Global cancellation token for hard-interrupting the inference stream.
    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
}
30
/// Reports whether `model` names a Gemma 4 model.
///
/// The comparison is ASCII-case-insensitive and accepts either spelling,
/// `gemma-4` or `gemma4`, anywhere inside the identifier.
pub fn is_gemma4_model_name(model: &str) -> bool {
    let normalized = model.to_ascii_lowercase();
    ["gemma-4", "gemma4"]
        .iter()
        .any(|marker| normalized.contains(*marker))
}
35
36fn should_use_gemma_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37    is_gemma4_model_name(model) && engine.gemma_native_formatting_enabled()
38}
39
40// ── OpenAI Tool Definition ────────────────────────────────────────────────────
41
/// One tool entry as carried in `ChatRequest::tools`.
///
/// Only `tool_type` (under the JSON key `type`) and `function` are serialized;
/// `metadata` is excluded by its serde skip attributes.
#[derive(Serialize, Clone, Debug)]
pub struct ToolDefinition {
    /// Serialized as `type`.
    #[serde(rename = "type")]
    pub tool_type: String,
    /// Name, description and parameters of the callable.
    pub function: ToolFunction,
    /// Local classification of the tool; never written to the wire.
    #[serde(skip_serializing, skip_deserializing)]
    pub metadata: ToolMetadata,
}
50
/// The `function` object nested inside a [`ToolDefinition`].
#[derive(Serialize, Clone, Debug)]
pub struct ToolFunction {
    /// Tool name.
    pub name: String,
    /// Human-readable description of the tool.
    pub description: String,
    /// Arbitrary JSON describing the arguments.
    /// NOTE(review): presumably a JSON Schema object — confirm against the tool registry.
    pub parameters: Value,
}
57
/// Coarse grouping of tools, assigned by `tool_metadata_for_name`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ToolCategory {
    /// Repository readers such as `read_file` and `grep_files`.
    RepoRead,
    /// File-changing tools such as `write_file` and `edit_file`.
    RepoWrite,
    /// `shell`, `inspect_host` and `resolve_host_issue`.
    Runtime,
    /// `trace_runtime_flow`.
    Architecture,
    /// `describe_toolchain`.
    Toolchain,
    /// `verify_build`.
    Verification,
    /// The `git_*` tools.
    Git,
    /// `research_web` and `fetch_docs`.
    Research,
    /// `vision_analyze`.
    Vision,
    /// The `lsp_*` tools.
    Lsp,
    /// Workflow and task helpers (`run_workspace_workflow`, `manage_tasks`, `clarify`, ...).
    Workflow,
    /// Any tool whose name starts with `mcp__`.
    External,
    /// Fallback for tool names that are not recognised.
    Other,
}
74
/// Static policy flags for a tool, produced by `tool_metadata_for_name`.
///
/// NOTE(review): how callers enforce each flag is outside this view; the
/// field notes below describe only which tools receive which value.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ToolMetadata {
    /// Grouping the tool belongs to.
    pub category: ToolCategory,
    /// True for file writers, `shell`, git tools, workflow runners,
    /// `resolve_host_issue` and MCP tools whose name contains a mutating verb.
    pub mutates_workspace: bool,
    /// True for MCP tools and `resolve_host_issue`.
    pub external_surface: bool,
    /// True for every mutating built-in tool and for all MCP tools.
    pub trust_sensitive: bool,
    /// True for every tool that is not flagged as mutating.
    pub read_only_friendly: bool,
    /// True for the repo read/write tools and for
    /// `auto_pin_context` / `list_pinned` / `clarify`.
    pub plan_scope: bool,
}
84
85pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
86    if name.starts_with("mcp__") {
87        let lower = name.to_ascii_lowercase();
88        let mutates_workspace = [
89            "__edit",
90            "__write",
91            "__create",
92            "__move",
93            "__delete",
94            "__remove",
95            "__rename",
96            "__replace",
97            "__patch",
98        ]
99        .iter()
100        .any(|needle| lower.contains(needle));
101        return ToolMetadata {
102            category: ToolCategory::External,
103            mutates_workspace,
104            external_surface: true,
105            trust_sensitive: true,
106            read_only_friendly: !mutates_workspace,
107            plan_scope: false,
108        };
109    }
110
111    match name {
112        "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
113            category: ToolCategory::RepoRead,
114            mutates_workspace: false,
115            external_surface: false,
116            trust_sensitive: false,
117            read_only_friendly: true,
118            plan_scope: true,
119        },
120        "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => ToolMetadata {
121            category: ToolCategory::RepoWrite,
122            mutates_workspace: true,
123            external_surface: false,
124            trust_sensitive: true,
125            read_only_friendly: false,
126            plan_scope: true,
127        },
128        "trace_runtime_flow" => ToolMetadata {
129            category: ToolCategory::Architecture,
130            mutates_workspace: false,
131            external_surface: false,
132            trust_sensitive: false,
133            read_only_friendly: true,
134            plan_scope: false,
135        },
136        "describe_toolchain" => ToolMetadata {
137            category: ToolCategory::Toolchain,
138            mutates_workspace: false,
139            external_surface: false,
140            trust_sensitive: false,
141            read_only_friendly: true,
142            plan_scope: false,
143        },
144        "shell" => ToolMetadata {
145            category: ToolCategory::Runtime,
146            mutates_workspace: true,
147            external_surface: false,
148            trust_sensitive: true,
149            read_only_friendly: false,
150            plan_scope: false,
151        },
152        "inspect_host" => ToolMetadata {
153            category: ToolCategory::Runtime,
154            mutates_workspace: false,
155            external_surface: false,
156            trust_sensitive: false,
157            read_only_friendly: true,
158            plan_scope: false,
159        },
160        "resolve_host_issue" => ToolMetadata {
161            category: ToolCategory::Runtime,
162            mutates_workspace: true,
163            external_surface: true,
164            trust_sensitive: true,
165            read_only_friendly: false,
166            plan_scope: false,
167        },
168        "run_hematite_maintainer_workflow" => ToolMetadata {
169            category: ToolCategory::Workflow,
170            mutates_workspace: true,
171            external_surface: false,
172            trust_sensitive: true,
173            read_only_friendly: false,
174            plan_scope: false,
175        },
176        "run_workspace_workflow" => ToolMetadata {
177            category: ToolCategory::Workflow,
178            mutates_workspace: true,
179            external_surface: false,
180            trust_sensitive: true,
181            read_only_friendly: false,
182            plan_scope: false,
183        },
184        "verify_build" => ToolMetadata {
185            category: ToolCategory::Verification,
186            mutates_workspace: false,
187            external_surface: false,
188            trust_sensitive: false,
189            read_only_friendly: true,
190            plan_scope: false,
191        },
192        "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
193            ToolMetadata {
194                category: ToolCategory::Git,
195                mutates_workspace: true,
196                external_surface: false,
197                trust_sensitive: true,
198                read_only_friendly: false,
199                plan_scope: false,
200            }
201        }
202        "research_web" | "fetch_docs" => ToolMetadata {
203            category: ToolCategory::Research,
204            mutates_workspace: false,
205            external_surface: false,
206            trust_sensitive: false,
207            read_only_friendly: true,
208            plan_scope: false,
209        },
210        "vision_analyze" => ToolMetadata {
211            category: ToolCategory::Vision,
212            mutates_workspace: false,
213            external_surface: false,
214            trust_sensitive: false,
215            read_only_friendly: true,
216            plan_scope: false,
217        },
218        "lsp_definitions"
219        | "lsp_references"
220        | "lsp_hover"
221        | "lsp_rename_symbol"
222        | "lsp_get_diagnostics"
223        | "lsp_search_symbol" => ToolMetadata {
224            category: ToolCategory::Lsp,
225            mutates_workspace: false,
226            external_surface: false,
227            trust_sensitive: false,
228            read_only_friendly: true,
229            plan_scope: false,
230        },
231        "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
232            category: ToolCategory::Workflow,
233            mutates_workspace: false,
234            external_surface: false,
235            trust_sensitive: false,
236            read_only_friendly: true,
237            plan_scope: true,
238        },
239        "manage_tasks" => ToolMetadata {
240            category: ToolCategory::Workflow,
241            mutates_workspace: false,
242            external_surface: false,
243            trust_sensitive: false,
244            read_only_friendly: true,
245            plan_scope: false,
246        },
247        _ => ToolMetadata {
248            category: ToolCategory::Other,
249            mutates_workspace: false,
250            external_surface: false,
251            trust_sensitive: false,
252            read_only_friendly: true,
253            plan_scope: false,
254        },
255    }
256}
257
258// ── Message types ─────────────────────────────────────────────────────────────
259
/// OpenAI-compatible chat message. Content can be a string (legacy) or a
/// Vec of ContentPart (multimodal).
///
/// Optional fields are left out of the serialized JSON when they are empty
/// (`tool_calls`) or `None` (`tool_call_id`, `name`).
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ChatMessage {
    /// Speaker role; the constructors in this file use `system`, `user`,
    /// `assistant` and `tool`.
    pub role: String,
    /// Support both simple string content and complex multi-part content (Vision).
    pub content: MessageContent,
    /// Assistant messages may have tool calls. Default to empty vec, not null.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tool_calls: Vec<ToolCallResponse>,
    /// Tool message references the original call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
    /// Tool message name.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
}
277
/// Message body: either a bare string or a list of typed parts.
///
/// The enum is `untagged`, so it (de)serializes as the bare inner value with
/// no wrapper key.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(untagged)]
pub enum MessageContent {
    /// Plain string content.
    Text(String),
    /// Multi-part content (text and/or image parts).
    Parts(Vec<ContentPart>),
}
284
/// One element of multi-part message content, discriminated by a `type` key
/// whose value is `text` or `image_url`.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(tag = "type")]
pub enum ContentPart {
    /// A text fragment (`"type": "text"`).
    #[serde(rename = "text")]
    Text { text: String },
    /// An image reference (`"type": "image_url"`).
    #[serde(rename = "image_url")]
    ImageUrl { image_url: ImageUrlSource },
}
293
/// Payload of an `image_url` content part.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ImageUrlSource {
    /// Location of the image.
    /// NOTE(review): whether this is an http(s) URL or a data URI is decided by callers — confirm.
    pub url: String,
}
298
299impl Default for MessageContent {
300    fn default() -> Self {
301        MessageContent::Text(String::new())
302    }
303}
304
305impl MessageContent {
306    pub fn as_str(&self) -> &str {
307        match self {
308            MessageContent::Text(s) => s,
309            MessageContent::Parts(parts) => {
310                for part in parts {
311                    if let ContentPart::Text { text } = part {
312                        return text;
313                    }
314                }
315                ""
316            }
317        }
318    }
319}
320
321impl ChatMessage {
322    pub fn system(content: &str) -> Self {
323        Self {
324            role: "system".into(),
325            content: MessageContent::Text(content.into()),
326            tool_calls: Vec::new(),
327            tool_call_id: None,
328            name: None,
329        }
330    }
331    pub fn user(content: &str) -> Self {
332        Self {
333            role: "user".into(),
334            content: MessageContent::Text(content.into()),
335            tool_calls: Vec::new(),
336            tool_call_id: None,
337            name: None,
338        }
339    }
340    pub fn user_with_image(text: &str, image_url: &str) -> Self {
341        let mut text_parts = text.to_string();
342        if !text_parts.contains("<|image|>") {
343            text_parts.push_str(" <|image|>");
344        }
345        Self {
346            role: "user".into(),
347            content: MessageContent::Parts(vec![
348                ContentPart::Text { text: text_parts },
349                ContentPart::ImageUrl {
350                    image_url: ImageUrlSource {
351                        url: image_url.into(),
352                    },
353                },
354            ]),
355            tool_calls: Vec::new(),
356            tool_call_id: None,
357            name: None,
358        }
359    }
360    pub fn assistant_text(content: &str) -> Self {
361        Self {
362            role: "assistant".into(),
363            content: MessageContent::Text(content.into()),
364            tool_calls: Vec::new(),
365            tool_call_id: None,
366            name: None,
367        }
368    }
369    pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
370        Self {
371            role: "assistant".into(),
372            content: MessageContent::Text(content.into()),
373            tool_calls: calls,
374            tool_call_id: None,
375            name: None,
376        }
377    }
378    pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
379        Self::tool_result_for_model(tool_call_id, fn_name, content, "")
380    }
381
382    /// Build a tool result message, applying Gemma 4 native markup only when the
383    /// loaded model is actually a Gemma 4 model.
384    pub fn tool_result_for_model(
385        tool_call_id: &str,
386        fn_name: &str,
387        content: &str,
388        model: &str,
389    ) -> Self {
390        let body = if is_gemma4_model_name(model) {
391            format!(
392                "<|tool_response>response:{}{}{}<tool_response|>",
393                fn_name, "{", content
394            )
395        } else {
396            content.to_string()
397        };
398        Self {
399            role: "tool".into(),
400            content: MessageContent::Text(body),
401            tool_calls: Vec::new(),
402            tool_call_id: Some(tool_call_id.into()),
403            name: Some(fn_name.into()),
404        }
405    }
406}
407
408// ── Tool call as returned by the model ───────────────────────────────────────
409
/// A single tool call as it appears in a message's `tool_calls` array.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallResponse {
    /// Identifier of the call; a tool result message refers back to it via
    /// `ChatMessage::tool_call_id`.
    pub id: String,
    /// (De)serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub call_type: String,
    /// Function name plus its encoded arguments.
    pub function: ToolCallFn,
}
417
/// Function portion of a [`ToolCallResponse`].
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallFn {
    /// Name of the function the model wants to invoke.
    pub name: String,
    /// JSON-encoded arguments string (as returned by the API).
    pub arguments: String,
}
424
425// ── HTTP request / response shapes ───────────────────────────────────────────
426
/// Serialized body of a chat request.
/// NOTE(review): the code that sends it is outside this view — presumably posted to `api_url`.
#[derive(Serialize)]
struct ChatRequest {
    /// Model identifier to run the request against.
    model: String,
    /// Conversation history, in order.
    messages: Vec<ChatMessage>,
    /// Sampling temperature.
    temperature: f32,
    /// Whether a streamed response is requested.
    stream: bool,
    /// Tool definitions; the key is omitted from the JSON entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    tools: Option<Vec<ToolDefinition>>,
}
436
/// Deserialized body of a chat response.
#[derive(Deserialize, Debug)]
struct ChatResponse {
    /// Candidate completions returned by the provider.
    choices: Vec<ResponseChoice>,
    /// Token accounting; `None` when the provider omits it.
    usage: Option<TokenUsage>,
}
442
/// Token accounting reported by the provider (the `usage` object of a
/// response); also carried by `InferenceEvent::UsageUpdate`.
#[derive(Deserialize, Debug, Clone)]
pub struct TokenUsage {
    /// Tokens consumed by the prompt.
    pub prompt_tokens: usize,
    /// Tokens generated for the completion.
    pub completion_tokens: usize,
    /// Total reported by the provider.
    pub total_tokens: usize,
    /// Defaults to 0 when the key is absent.
    /// NOTE(review): presumably a provider-specific prompt-cache hit counter — confirm.
    #[serde(default)]
    pub prompt_cache_hit_tokens: usize,
    /// Defaults to 0 when the key is absent.
    /// NOTE(review): presumably a provider-specific cache-read counter — confirm.
    #[serde(default)]
    pub cache_read_input_tokens: usize,
}
453
/// One entry of `ChatResponse::choices`.
#[derive(Deserialize, Debug)]
struct ResponseChoice {
    /// The message produced for this choice.
    message: ResponseMessage,
    /// Why generation stopped; `None` when the key is absent or null.
    #[serde(default)]
    finish_reason: Option<String>,
}
460
/// Message payload inside a [`ResponseChoice`]; every field is optional.
#[derive(Deserialize, Debug)]
struct ResponseMessage {
    /// Text content, if any.
    content: Option<String>,
    /// Structured tool calls, if any.
    tool_calls: Option<Vec<ToolCallResponse>>,
    /// LM Studio routes Qwen3 thinking-mode output here instead of wrapping
    /// it in <think> tags inside `content`. When tool calls are generated
    /// inside a think block, they end up here rather than in `tool_calls`.
    #[serde(default)]
    reasoning_content: Option<String>,
}
471
/// Lower bound, in tokens, for the output reservation.
/// NOTE(review): the code applying this bound is outside this view — confirm usage.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1024;
/// Upper bound, in tokens, for the output reservation.
/// NOTE(review): the code applying this bound is outside this view — confirm usage.
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4096;
474
/// True when the context window is at most 8,192 tokens.
fn is_tiny_context_window(context_length: usize) -> bool {
    matches!(context_length, 0..=8_192)
}
478
/// True when the context window is larger than 8,192 tokens but no larger
/// than 49,152 tokens.
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
482
/// Public wrapper around the private `is_compact_context_window` check;
/// returns exactly what that function returns for `context_length`.
pub fn is_compact_context_window_pub(context_length: usize) -> bool {
    is_compact_context_window(context_length)
}
486
/// Recognises provider error text that reports a context-limit overflow.
///
/// `lower` must already be lower-cased; matching is case-sensitive. The text
/// matches when it names both `n_keep` and `n_ctx`, or when it contains any
/// one of the stand-alone phrases below.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const STANDALONE_PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    let names_both_limits = lower.contains("n_keep") && lower.contains("n_ctx");
    names_both_limits
        || STANDALONE_PHRASES
            .iter()
            .any(|phrase| lower.contains(*phrase))
}
494
495fn classify_runtime_failure_tag(detail: &str) -> &'static str {
496    let lower = detail.to_ascii_lowercase();
497    if lower.contains("context_window_blocked")
498        || lower.contains("context ceiling reached")
499        || lower.contains("exceeds the")
500        || is_provider_context_limit_detail(&lower)
501    {
502        "context_window"
503    } else if lower.contains("empty response from model")
504        || lower.contains("model returned an empty response")
505    {
506        "empty_model_response"
507    } else if lower.contains("action blocked:")
508        || lower.contains("access denied")
509        || lower.contains("declined by user")
510    {
511        "tool_policy_blocked"
512    } else {
513        "provider_degraded"
514    }
515}
516
/// Returns the operator guidance sentence for a failure tag.
///
/// Known tags are `context_window`, `empty_model_response` and
/// `tool_policy_blocked`; every other tag gets the generic retry guidance.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const CONTEXT_WINDOW: &str = "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    const EMPTY_RESPONSE: &str = "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    const POLICY_BLOCKED: &str =
        "Stay inside the allowed workflow or switch modes before retrying.";
    const GENERIC: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";

    if tag == "context_window" {
        CONTEXT_WINDOW
    } else if tag == "empty_model_response" {
        EMPTY_RESPONSE
    } else if tag == "tool_policy_blocked" {
        POLICY_BLOCKED
    } else {
        GENERIC
    }
}
531
532fn format_runtime_failure_message(detail: &str) -> String {
533    let tag = classify_runtime_failure_tag(detail);
534    format!(
535        "[failure:{}] {} Detail: {}",
536        tag,
537        runtime_failure_guidance(tag),
538        detail.trim()
539    )
540}
541
/// Compact provider/runtime state, reported through
/// `InferenceEvent::ProviderStatus`.
///
/// NOTE(review): only `ContextWindow`, `EmptyResponse` and `Degraded` are
/// produced in the visible part of this file (by
/// `provider_state_for_failure_tag`); the other variants are set elsewhere.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProviderRuntimeState {
    Booting,
    Live,
    Recovering,
    /// Fallback state for any failure tag without a dedicated variant.
    Degraded,
    /// State for the `context_window` failure tag.
    ContextWindow,
    /// State for the `empty_model_response` failure tag.
    EmptyResponse,
}
551
/// Compact MCP server health, reported through `InferenceEvent::McpStatus`.
/// NOTE(review): the code that selects a variant is outside this view.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum McpRuntimeState {
    Unconfigured,
    Healthy,
    Degraded,
    Failed,
}
559
/// Typed operator checkpoint/blocker state, reported through
/// `InferenceEvent::OperatorCheckpoint`. Each variant has a stable
/// snake_case name available from `label()`.
/// NOTE(review): the code that selects a variant is outside this view.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OperatorCheckpointState {
    Idle,
    RecoveringProvider,
    BudgetReduced,
    HistoryCompacted,
    BlockedContextWindow,
    BlockedPolicy,
    BlockedRecentFileEvidence,
    BlockedExactLineWindow,
    BlockedToolLoop,
    BlockedVerification,
}
573
574impl OperatorCheckpointState {
575    pub fn label(self) -> &'static str {
576        match self {
577            OperatorCheckpointState::Idle => "idle",
578            OperatorCheckpointState::RecoveringProvider => "recovering_provider",
579            OperatorCheckpointState::BudgetReduced => "budget_reduced",
580            OperatorCheckpointState::HistoryCompacted => "history_compacted",
581            OperatorCheckpointState::BlockedContextWindow => "blocked_context_window",
582            OperatorCheckpointState::BlockedPolicy => "blocked_policy",
583            OperatorCheckpointState::BlockedRecentFileEvidence => "blocked_recent_file_evidence",
584            OperatorCheckpointState::BlockedExactLineWindow => "blocked_exact_line_window",
585            OperatorCheckpointState::BlockedToolLoop => "blocked_tool_loop",
586            OperatorCheckpointState::BlockedVerification => "blocked_verification",
587        }
588    }
589}
590
591fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
592    match tag {
593        "context_window" => ProviderRuntimeState::ContextWindow,
594        "empty_model_response" => ProviderRuntimeState::EmptyResponse,
595        _ => ProviderRuntimeState::Degraded,
596    }
597}
598
/// Produces a one-line summary of a runtime failure.
///
/// The three known tags (`context_window`, `empty_model_response`,
/// `tool_policy_blocked`) return fixed sentences and ignore `detail`. Any
/// other tag returns `LM Studio degraded: <excerpt>`, where the excerpt is the
/// first 12 whitespace-separated words of `detail` joined by single spaces and
/// capped at 110 bytes, with `...` appended when it was cut. An empty excerpt
/// yields a fixed fallback sentence instead.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    const MAX_EXCERPT_WORDS: usize = 12;
    const MAX_EXCERPT_BYTES: usize = 110;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(MAX_EXCERPT_WORDS)
                .collect::<Vec<_>>()
                .join(" ");
            if excerpt.len() > MAX_EXCERPT_BYTES {
                // `String::truncate` panics when the cut falls inside a multi-byte
                // character, so back off to the nearest boundary at or below the
                // limit. Index 0 is always a boundary, so the loop terminates.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }
            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
632
633// ── Events pushed to the TUI ──────────────────────────────────────────────────
634
/// Events pushed from the inference/agent side to the TUI.
#[derive(Debug)]
pub enum InferenceEvent {
    /// A text token to append to the current assistant message.
    Token(String),
    /// A text token to be displayed on screen but NOT spoken (e.g. startup greeting).
    MutedToken(String),
    /// Internal model reasoning (shown in side panel, not dialogue).
    Thought(String),
    /// Critical diagnostic feedback from the voice synthesis engine.
    VoiceStatus(String),
    /// A tool call is starting – show a status line in the TUI.
    ToolCallStart {
        id: String,
        name: String,
        args: String,
    },
    /// A tool call completed – show result in the TUI.
    ToolCallResult {
        id: String,
        name: String,
        output: String,
        is_error: bool,
    },
    /// A risky tool requires explicit user approval.
    /// The TUI must send `true` (approved) or `false` (rejected) via `responder`.
    /// When `diff` is Some, the modal renders a coloured before/after diff preview.
    ApprovalRequired {
        id: String,
        name: String,
        display: String,
        /// Pre-formatted diff: lines starting with "- " are removals, "+ " are additions,
        /// "---" is a file header.  None means a plain high-risk approval (no diff).
        diff: Option<String>,
        /// One-shot channel on which the TUI reports the user's decision.
        responder: tokio::sync::oneshot::Sender<bool>,
    },
    /// The current agent turn is complete.
    Done,
    /// An error occurred during inference.
    Error(String),
    /// Compact provider/runtime state for the operator surface.
    ProviderStatus {
        state: ProviderRuntimeState,
        summary: String,
    },
    /// Typed operator checkpoint/blocker state for SPECULAR and recovery UIs.
    OperatorCheckpoint {
        state: OperatorCheckpointState,
        summary: String,
    },
    /// Typed recovery recipe summary for operator/debug surfaces.
    RecoveryRecipe { summary: String },
    /// Compact MCP/runtime server health for the operator surface.
    McpStatus {
        state: McpRuntimeState,
        summary: String,
    },
    /// Current compaction pressure against the adaptive threshold.
    CompactionPressure {
        estimated_tokens: usize,
        threshold_tokens: usize,
        percent: u8,
    },
    /// Current total prompt-budget pressure against the live context window.
    PromptPressure {
        estimated_input_tokens: usize,
        reserved_output_tokens: usize,
        estimated_total_tokens: usize,
        context_length: usize,
        percent: u8,
    },
    /// A generic task progress update (e.g. for single-agent tool execution).
    TaskProgress {
        id: String,
        label: String,
        progress: u8,
    },
    /// Real-time token usage update from the API.
    UsageUpdate(TokenUsage),
    /// The current runtime profile detected from LM Studio.
    RuntimeProfile {
        model_id: String,
        context_length: usize,
    },
    /// Vein index status after each incremental re-index.
    VeinStatus {
        file_count: usize,
        embedded_count: usize,
        docs_only: bool,
    },
    /// File paths the Vein surfaced as relevant to the current turn.
    /// Used to populate ACTIVE CONTEXT with retrieval results.
    VeinContext { paths: Vec<String> },
    /// A new companion was hatched mid-session via /reroll.
    SoulReroll {
        species: String,
        rarity: String,
        shiny: bool,
        personality: String,
    },
    /// Embed model loaded/unloaded mid-session.
    EmbedProfile { model_id: Option<String> },
    /// A single line of live shell output, streamed while the command runs.
    /// Displayed in the SPECULAR panel so the operator sees progress without
    /// waiting for the full command to finish.
    ShellLine(String),
}
741
742// ── Engine implementation ─────────────────────────────────────────────────────
743
744impl InferenceEngine {
745    pub fn new(
746        api_url: String,
747        species: String,
748        snark: u8,
749    ) -> Result<Self, Box<dyn std::error::Error>> {
750        let client = reqwest::Client::builder()
751            .timeout(std::time::Duration::from_secs(180))
752            .build()?;
753
754        // Extract http://host:port as the base for all non-completions endpoints.
755        let base_url = {
756            let trimmed = api_url.trim_end_matches('/');
757            if let Some(scheme_end) = trimmed.find("://") {
758                let after_scheme = &trimmed[scheme_end + 3..];
759                if let Some(path_start) = after_scheme.find('/') {
760                    format!(
761                        "{}://{}",
762                        &trimmed[..scheme_end],
763                        &after_scheme[..path_start]
764                    )
765                } else {
766                    trimmed.to_string()
767                }
768            } else {
769                trimmed.to_string()
770            }
771        };
772
773        let api_url = if api_url.ends_with("/chat/completions") {
774            api_url
775        } else if api_url.ends_with("/") {
776            format!("{}chat/completions", api_url)
777        } else {
778            format!("{}/chat/completions", api_url)
779        };
780
781        Ok(Self {
782            client,
783            api_url,
784            base_url,
785            species,
786            snark,
787            kv_semaphore: Semaphore::new(3),
788            model: std::sync::RwLock::new(String::new()),
789            context_length: std::sync::atomic::AtomicUsize::new(32_768), // Gemma-4 Sweet Spot (32K)
790            economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
791            worker_model: None,
792            gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
793            cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
794        })
795    }
796
797    pub fn set_gemma_native_formatting(&self, enabled: bool) {
798        self.gemma_native_formatting
799            .store(enabled, std::sync::atomic::Ordering::SeqCst);
800    }
801
802    pub fn gemma_native_formatting_enabled(&self) -> bool {
803        self.gemma_native_formatting
804            .load(std::sync::atomic::Ordering::SeqCst)
805    }
806
807    pub fn current_model(&self) -> String {
808        self.model.read().map(|g| g.clone()).unwrap_or_default()
809    }
810
811    pub fn current_context_length(&self) -> usize {
812        self.context_length
813            .load(std::sync::atomic::Ordering::SeqCst)
814    }
815
816    pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
817        if let Ok(mut guard) = self.model.write() {
818            *guard = model.to_string();
819        }
820        self.context_length
821            .store(context_length, std::sync::atomic::Ordering::SeqCst);
822    }
823
824    /// Returns true if LM Studio is reachable.
825    pub async fn health_check(&self) -> bool {
826        let url = format!("{}/v1/models", self.base_url);
827        match self.client.get(&url).send().await {
828            Ok(resp) => resp.status().is_success(),
829            Err(_) => false,
830        }
831    }
832
833    /// Query /api/v0/models and return the first loaded chat model id.
834    /// Uses /api/v0/models (not /v1/models) because the OpenAI-compat endpoint
835    /// omits the `type` field, making it impossible to distinguish embedding
836    /// models from chat models. Falls back to /v1/models with a name heuristic
837    /// if /api/v0/models is unavailable.
838    /// Returns Some("") when LM Studio is reachable but no chat model is loaded
839    /// so callers can distinguish "offline" (None) from "no chat model" (Some("")).
840    pub async fn get_loaded_model(&self) -> Option<String> {
841        #[derive(Deserialize)]
842        struct ModelList {
843            data: Vec<ModelEntry>,
844        }
845        #[derive(Deserialize)]
846        struct ModelEntry {
847            id: String,
848            #[serde(rename = "type", default)]
849            model_type: String,
850            #[serde(default)]
851            state: String,
852        }
853
854        // Try /api/v0/models first — it has type and state fields.
855        if let Ok(resp) = self
856            .client
857            .get(format!("{}/api/v0/models", self.base_url))
858            .send()
859            .await
860        {
861            if let Ok(list) = resp.json::<ModelList>().await {
862                let chat_model = list
863                    .data
864                    .into_iter()
865                    .find(|m| m.model_type != "embeddings" && m.state == "loaded")
866                    .map(|m| m.id)
867                    .unwrap_or_default();
868                return Some(chat_model);
869            }
870        }
871
872        // Fallback: /v1/models lacks type info — use name heuristic to skip embed models.
873        let resp = self
874            .client
875            .get(format!("{}/v1/models", self.base_url))
876            .send()
877            .await
878            .ok()?;
879        let list: ModelList = resp.json().await.ok()?;
880        Some(
881            list.data
882                .into_iter()
883                .find(|m| !m.id.to_lowercase().contains("embed"))
884                .map(|m| m.id)
885                .unwrap_or_default(),
886        )
887    }
888
889    /// Returns the ID of the first loaded embedding model, if any.
890    /// Uses /api/v0/models which includes `type` and `state` fields.
891    /// The OpenAI-compat /v1/models endpoint omits `type` so cannot be used here.
892    /// Accepts any non-empty state (not just "loaded") to handle LM Studio variants
893    /// where the embed model may report a different state string at startup.
894    pub async fn get_embedding_model(&self) -> Option<String> {
895        #[derive(Deserialize)]
896        struct ModelList {
897            data: Vec<ModelEntry>,
898        }
899        #[derive(Deserialize)]
900        struct ModelEntry {
901            id: String,
902            #[serde(rename = "type", default)]
903            model_type: String,
904            #[serde(default)]
905            state: String,
906        }
907        let resp = self
908            .client
909            .get(format!("{}/api/v0/models", self.base_url))
910            .send()
911            .await
912            .ok()?;
913        let list: ModelList = resp.json().await.ok()?;
914        list.data
915            .into_iter()
916            .find(|m| m.model_type == "embeddings" && m.state == "loaded")
917            .map(|m| m.id)
918    }
919
920    /// Detect the loaded model's context window size.
921    /// Tries LM Studio's `/api/v0/models` endpoint first and prefers the loaded
922    /// model's live `loaded_context_length`, then falls back to older
923    /// `context_length` / `max_context_length` style fields.
924    /// Falls back to a heuristic from the model name, then 32K.
925    pub async fn detect_context_length(&self) -> usize {
926        #[derive(Deserialize)]
927        struct LmStudioModel {
928            id: Option<String>,
929            #[serde(rename = "type", default)]
930            model_type: String,
931            state: Option<String>,
932            loaded_context_length: Option<u64>,
933            context_length: Option<u64>,
934            max_context_length: Option<u64>,
935        }
936        #[derive(Deserialize)]
937        struct LmStudioList {
938            data: Vec<LmStudioModel>,
939        }
940
941        // Check api/v0/models (LM Studio specific)
942        if let Ok(resp) = self
943            .client
944            .get(format!("{}/api/v0/models", self.base_url))
945            .send()
946            .await
947        {
948            if let Ok(list) = resp.json::<LmStudioList>().await {
949                let target_model = self.current_model().to_ascii_lowercase();
950                // Never select embedding models for context-length detection.
951                let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
952                let loaded = list
953                    .data
954                    .iter()
955                    .find(|m| {
956                        non_embed(m)
957                            && m.state.as_deref() == Some("loaded")
958                            && m.id
959                                .as_deref()
960                                .map(|id| id.eq_ignore_ascii_case(&target_model))
961                                .unwrap_or(false)
962                    })
963                    .or_else(|| {
964                        list.data
965                            .iter()
966                            .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
967                    })
968                    .or_else(|| {
969                        list.data.iter().find(|m| {
970                            non_embed(m)
971                                && m.id
972                                    .as_deref()
973                                    .map(|id| id.eq_ignore_ascii_case(&target_model))
974                                    .unwrap_or(false)
975                        })
976                    })
977                    .or_else(|| list.data.iter().find(|m| non_embed(m)));
978
979                if let Some(model) = loaded {
980                    if let Some(ctx) = model.loaded_context_length {
981                        if ctx > 0 {
982                            return ctx as usize;
983                        }
984                    }
985                    if let Some(ctx) = model.context_length {
986                        if ctx > 0 {
987                            return ctx as usize;
988                        }
989                    }
990                    if let Some(ctx) = model.max_context_length {
991                        if ctx > 0 && ctx <= 32_768 {
992                            return ctx as usize;
993                        }
994                    }
995                }
996            }
997        }
998
999        // Heuristic fallback:
1000        // If "gemma-4" is detected, we target 32,768 as the baseline standard,
1001        // acknowledging that 131,072 is available for High-Capacity tasks.
1002        if self.current_model().to_lowercase().contains("gemma-4") {
1003            return 32_768;
1004        }
1005
1006        32_768
1007    }
1008
1009    pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
1010        let previous_model = self.current_model();
1011        let previous_context = self.current_context_length();
1012
1013        let detected_model = match self.get_loaded_model().await {
1014            Some(m) if !m.is_empty() => m,            // coding model found
1015            Some(_) => "no model loaded".to_string(), // reachable but no coding model
1016            None => previous_model.clone(),           // LM Studio offline
1017        };
1018
1019        if !detected_model.is_empty() && detected_model != previous_model {
1020            if let Ok(mut guard) = self.model.write() {
1021                *guard = detected_model.clone();
1022            }
1023        }
1024
1025        let detected_context = self.detect_context_length().await;
1026        let effective_model = if detected_model.is_empty() {
1027            previous_model.clone()
1028        } else {
1029            detected_model
1030        };
1031
1032        let changed = effective_model != previous_model || detected_context != previous_context;
1033        self.set_runtime_profile(&effective_model, detected_context);
1034
1035        Some((effective_model, detected_context, changed))
1036    }
1037
1038    pub fn build_system_prompt(
1039        &self,
1040        snark: u8,
1041        chaos: u8,
1042        brief: bool,
1043        professional: bool,
1044        tools: &[ToolDefinition],
1045        reasoning_history: Option<&str>,
1046        mcp_tools: &[crate::agent::mcp::McpTool],
1047    ) -> String {
1048        let mut sys = self.build_system_prompt_legacy(
1049            snark,
1050            chaos,
1051            brief,
1052            professional,
1053            tools,
1054            reasoning_history,
1055        );
1056
1057        if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1058            sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1059            sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1060            for tool in mcp_tools {
1061                let description = tool
1062                    .description
1063                    .as_deref()
1064                    .unwrap_or("No description provided.");
1065                sys.push_str(&format!("- {}: {}\n", tool.name, description));
1066            }
1067        }
1068
1069        sys
1070    }
1071
    /// Build the full-size system prompt (without the MCP tool listing).
    ///
    /// Dispatches on the current context window first: a tiny window returns
    /// `build_system_prompt_tiny`, a compact window returns
    /// `build_system_prompt_compact`, and only larger windows get the full prompt
    /// assembled below.
    ///
    /// Parameters:
    /// - `snark`: shown in the identity line when `professional` is false.
    /// - `_chaos`: accepted for signature compatibility; not read by this function.
    /// - `brief`: selects the LOW thought-efficiency section and adds a BRIEF MODE line.
    /// - `professional`: selects the terse identity line without species/snark.
    /// - `tools`: each entry is appended as a native tool declaration.
    /// - `reasoning_history`: when `Some` and non-empty, injected as an
    ///   "INTERNAL STATE (ACTIVE TURN)" section.
    ///
    /// Returns the assembled prompt text.
    pub fn build_system_prompt_legacy(
        &self,
        snark: u8,
        _chaos: u8,
        brief: bool,
        professional: bool,
        tools: &[ToolDefinition],
        reasoning_history: Option<&str>,
    ) -> String {
        // Small context windows get a dedicated, shorter prompt; everything after
        // these two early returns applies to larger windows only.
        let current_context_length = self.current_context_length();
        if is_tiny_context_window(current_context_length) {
            return self.build_system_prompt_tiny(brief, professional);
        }
        if is_compact_context_window(current_context_length) {
            return self.build_system_prompt_compact(brief, professional, tools);
        }

        // Hematite bootstrap: keep reasoning disciplined without leaking scaffolding into user-facing replies.
        // The header literal is split in two so the running build version can be spliced in between.
        let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
                                     - You are Hematite, a local coding system working on the user's machine.\n\
                                     - The running Hematite build is ");
        sys.push_str(&crate::hematite_version_display());
        sys.push_str(".\n\
                                     - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
                                     - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
                                     - For simple questions, answer briefly in plain language.\n\
                                     - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
                                     - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
                                     - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
                                     - Keep internal reasoning inside channel delimiters.\n\
                                     - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
                                     <turn|>\n\n");

        // Optional carry-over of reasoning from the active turn; skipped when empty.
        if let Some(history) = reasoning_history {
            if !history.is_empty() {
                sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
                sys.push_str(history);
                sys.push_str("\n\n");
            }
        }

        // ADAPTIVE THOUGHT EFFICIENCY (Gemma-4 Native)
        if brief {
            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
                          - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
                          - Depth: Surface-level verification only.\n\n");
        } else {
            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
                          - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
                          - Depth: Full multi-step derivation required.\n\n");
        }

        // IDENTITY & ENVIRONMENT
        // `os` is the compile-time target OS name from the standard library.
        let os = std::env::consts::OS;
        if professional {
            sys.push_str(&format!(
                "You are Hematite, a local coding system running on {}. \
                 The TUI is one interface layer, not your whole identity. \
                 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
                 Skip filler and keep the focus on the work.\n",
                os
            ));
        } else {
            sys.push_str(&format!(
                "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
                 The terminal UI is only one surface of the system. \
                 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
                 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
                self.species, snark, os
            ));
        }

        // Inject loaded model and context window so the model knows its own budget.
        // When no model id is known, only the context window is reported.
        let current_model = self.current_model();
        if !current_model.is_empty() {
            sys.push_str(&format!(
                "Loaded model: {} | Context window: {} tokens. \
                 Calibrate response length and tool-call depth to fit within this budget.\n\n",
                current_model, current_context_length
            ));
            if is_gemma4_model_name(&current_model) {
                sys.push_str(
                    "Gemma 4 native note: prefer exact tool JSON with no extra prose when calling tools. \
                     Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
                     For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
                );
            }
        } else {
            sys.push_str(&format!(
                "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
                current_context_length
            ));
        }

        // PROTOCOL & TOOLS
        // The shell guidance is chosen at compile time from the build target.
        let shell_desc = if cfg!(target_os = "windows") {
            "[EXTERNAL SHELL]: `powershell` (Windows).\n\
             - Use ONLY for builds, tests, or file migrations. \n\
             - You MUST use the `powershell` tool directly. \n\
             - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
        } else {
            "[EXTERNAL SHELL]: `bash` (Unix).\n\
             - Use ONLY for builds, tests, or file migrations. \n\
             - NEVER wrap bash in other shells. \n\n"
        };

        sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
                      [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
                      - These are the ONLY way to explore and modify code. \n\
                      - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
        sys.push_str(shell_desc);

        // ANTI-LOOPING & SELF-AUDIT
        sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
                      SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");

        // Emitted in addition to the LOW thought-efficiency section above.
        if brief {
            sys.push_str(
                "BRIEF MODE: Respond in exactly ONE concise sentence unless providing code.\n\n",
            );
        }

        // NOTE(review): directive 1 below refers to `<think>...</think>` blocks while the
        // operating-protocol header tells the model to use `<|channel>thought ... <channel|>`.
        // Confirm that both reasoning conventions are intended to coexist.
        sys.push_str("## CORE DIRECTIVES\n\
                       1. REASONING: Your internal reasoning goes in <think>...</think> blocks. Do NOT output reasoning as plain text.\n\
                       2. CONCISENESS: After <think>, output ONE concise technical sentence or code block. Nothing else.\n\
                       3. GROUNDEDNESS: Never invent tools, channels, or files. If a detail is not verified from tool output, say `uncertain`. Answer from stable Hematite capabilities unless repo implementation is requested.\n\n\
                       ## ARCHITECTURAL DISCIPLINE\n\
                       - HOST INSPECTION PRIORITY: MANDATORY. For all diagnostic questions (load, CPU/RAM, processes, toolchains, network, ports, OS config, log-checks), prefer `inspect_host` over raw `shell`. If `env_doctor` answers, do not follow with `path` unless requested.\n\
                       - WORKFLOW PRIORITY: Prefer `run_workspace_workflow` for project builds/tests and `run_hematite_maintainer_workflow` for app-level scripts over raw `shell`.\n\
                       - PROOF BEFORE ACTION: `read_file` or `inspect_lines` before editing. For files >200 lines, `grep_files` for a pattern BEFORE reading.\n\
                       - PROOF BEFORE COMMIT: Do not `git_commit` until a successful `verify_build` exists for the latest changes.\n\
                       - BUILT-IN FIRST: Prefer internal file tools over `mcp__filesystem__*` unless MCP is explicitly required.\n\n\
                       ## TECHNIQUE & SAFETY\n\
                       - SHELL DISCIPLINE: Risky `shell` calls REQUIRE a `reason` argument. Always use `powershell` on Windows; never `bash` or `/dev/null`.\n\
                       - EDIT & TOOL PRECISION: Use unique lines/anchors for `edit_file`. Do not call tools like 'think' or 'reasoning'. Paths and symbols are NOT tool names.\n\
                       - CONTEXT AWARENESS: Answer at the harness level (file ops, shell, build). Prefer real language examples (Python, C#, TS, Go). Never mention `mcp__*` tools unless active and relevant.\n\
                       - TOOLING DISCIPLINE: Prefer `describe_toolchain` over improvising tool surface from memory; preserve its identifiers exactly.");

        // Scaffolding protocol — enforces build validation after project creation.
        // NOTE(review): the SCAFFOLDING PROTOCOL list in this literal starts at item 2;
        // an item 1 appears to be missing. Confirm the intended first step.
        sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
            2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
            3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
            4. Fix all errors before declaring success.\n\n\
            ## PRE-FLIGHT SCOPING PROTOCOL\n\
            Before attempting any multi-file task or complex refactor:\n\
            1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
            2. Use `auto_pin_context` to keep those files in active context.\n\
            3. Only then proceed to deeper edits or research.\n\n\
            ## REFACTORING PROTOCOL\n\
            When modifying existing code or renaming symbols:\n\
            1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
            2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
            3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");

        // Inject CLAUDE.md / instruction files from the project directory.
        sys.push_str(&load_instruction_files());

        // Inject cross-session memories synthesized by DeepReflect.
        sys.push_str(&crate::memory::deep_reflect::load_recent_memories());

        // Native Gemma-4 Tool Declarations
        // Each tool contributes two lines: a declaration built from the tool name, a literal
        // `{`, and the JSON-serialized parameter schema (or "{}" if serialization fails),
        // followed by a `//` line carrying the description.
        // NOTE(review): no closing `}` is appended after the schema, and the description
        // line ends with a `)` that has no opening partner. Confirm this is the intended
        // wire format before relying on it.
        if !tools.is_empty() {
            sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
            for tool in tools {
                let schema = serde_json::to_string(&tool.function.parameters)
                    .unwrap_or_else(|_| "{}".to_string());
                sys.push_str(&format!(
                    "<|tool>declaration:{}{}{}<tool|>\n",
                    tool.function.name, "{", schema
                ));
                sys.push_str(&format!("// {})\n", tool.function.description));
            }
        }

        sys
    }
1248
1249    fn build_system_prompt_compact(
1250        &self,
1251        brief: bool,
1252        professional: bool,
1253        tools: &[ToolDefinition],
1254    ) -> String {
1255        // Compact tier: fits in 16k context. Keeps tool names + one-line descriptions
1256        // but skips full JSON schemas, verbose protocol sections, and CLAUDE.md injection.
1257        let current_model = self.current_model();
1258        let current_context_length = self.current_context_length();
1259        let os = std::env::consts::OS;
1260
1261        let mut sys = String::from("<|turn>system\n<|think|>\n");
1262        sys.push_str(&format!(
1263            "You are Hematite {}, a local coding harness working on the user's machine.\n",
1264            crate::hematite_version_display()
1265        ));
1266        if professional {
1267            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1268        } else {
1269            sys.push_str(&format!(
1270                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1271                self.species
1272            ));
1273        }
1274        sys.push_str(&format!(
1275            "Model: {} | Context: {} tokens. Keep turns focused.\n",
1276            current_model, current_context_length
1277        ));
1278        if is_gemma4_model_name(&current_model) {
1279            sys.push_str(
1280                "Gemma 4: use exact tool JSON. No extra prose in tool calls. \
1281                 Raw regex patterns in grep_files, no slash delimiters.\n",
1282            );
1283        }
1284        if cfg!(target_os = "windows") {
1285            sys.push_str(&format!(
1286                "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1287                os
1288            ));
1289        } else {
1290            sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1291        }
1292        if brief {
1293            sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1294        }
1295
1296        sys.push_str(
1297            "\nCORE RULES:\n\
1298             - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1299             - Verify after edits: run `verify_build` after code changes, before committing.\n\
1300             - One tool at a time. Do not batch unrelated tool calls.\n\
1301             - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1302             - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1303             - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1304        );
1305
1306        if !tools.is_empty() {
1307            sys.push_str("\n# AVAILABLE TOOLS\n");
1308            for tool in tools {
1309                let desc: String = tool.function.description.chars().take(120).collect();
1310                sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1311            }
1312        }
1313
1314        sys.push_str("<turn|>\n");
1315        sys
1316    }
1317
1318    fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1319        let current_model = self.current_model();
1320        let current_context_length = self.current_context_length();
1321        let os = std::env::consts::OS;
1322        let mut sys = format!(
1323            "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1324            crate::hematite_version_display()
1325        );
1326        if professional {
1327            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1328        } else {
1329            sys.push_str(&format!(
1330                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1331                self.species
1332            ));
1333        }
1334        if !current_model.is_empty() {
1335            sys.push_str(&format!(
1336                "Loaded model: {} | Context window: {} tokens.\n",
1337                current_model, current_context_length
1338            ));
1339        } else {
1340            sys.push_str(&format!(
1341                "Context window: {} tokens.\n",
1342                current_context_length
1343            ));
1344        }
1345        sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1346        sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1347        sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1348        if cfg!(target_os = "windows") {
1349            sys.push_str(&format!(
1350                "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1351                os
1352            ));
1353        } else {
1354            sys.push_str(&format!(
1355                "You are running on {}. Use the native Unix shell conventions.\n",
1356                os
1357            ));
1358        }
1359        if brief {
1360            sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1361        }
1362        if is_gemma4_model_name(&current_model) {
1363            sys.push_str(
1364                "Gemma 4 note: use exact tool JSON with no extra prose when calling tools.\n",
1365            );
1366        }
1367        sys.push_str("<turn|>\n");
1368        sys
1369    }
1370
1371    // ── Non-streaming call (used for agentic turns with tool support) ─────────
1372
1373    /// Send messages to the model. Returns (text_content, tool_calls).
1374    /// Exactly one of the two will be Some on a successful response.
1375    pub async fn call_with_tools(
1376        &self,
1377        messages: &[ChatMessage],
1378        tools: &[ToolDefinition],
1379        // Override the model ID for this call. None = use the live runtime model.
1380        model_override: Option<&str>,
1381    ) -> Result<
1382        (
1383            Option<String>,
1384            Option<Vec<ToolCallResponse>>,
1385            Option<TokenUsage>,
1386            Option<String>,
1387        ),
1388        String,
1389    > {
1390        let _permit = self
1391            .kv_semaphore
1392            .acquire()
1393            .await
1394            .map_err(|e| e.to_string())?;
1395
1396        let current_model = self.current_model();
1397        let model = model_override.unwrap_or(current_model.as_str()).to_string();
1398        let filtered_tools = if cfg!(target_os = "windows") {
1399            tools
1400                .iter()
1401                .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1402                .cloned()
1403                .collect::<Vec<_>>()
1404        } else {
1405            tools.to_vec()
1406        };
1407
1408        let request_messages = if should_use_gemma_native_formatting(self, &model) {
1409            prepare_gemma_native_messages(messages)
1410        } else {
1411            messages.to_vec()
1412        };
1413
1414        // In compact context windows, restrict tools to the core coding set.
1415        // Full schemas for 36+ tools add 10k+ tokens via the model's chat template (e.g. Gemma 4).
1416        // Sending a small core set keeps schemas available for structured tool-call dispatch
1417        // while staying within the 16k budget.
1418        const COMPACT_CORE_TOOLS: &[&str] = &[
1419            "read_file",
1420            "inspect_lines",
1421            "edit_file",
1422            "write_file",
1423            "grep_files",
1424            "list_files",
1425            "verify_build",
1426            "shell",
1427        ];
1428        let effective_tools = if is_compact_context_window(self.current_context_length()) {
1429            let core: Vec<_> = filtered_tools
1430                .iter()
1431                .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1432                .cloned()
1433                .collect();
1434            if core.is_empty() {
1435                None
1436            } else {
1437                Some(core)
1438            }
1439        } else if filtered_tools.is_empty() {
1440            None
1441        } else {
1442            Some(filtered_tools)
1443        };
1444
1445        let request = ChatRequest {
1446            model: model.clone(),
1447            messages: request_messages,
1448            temperature: 0.2,
1449            stream: false,
1450            tools: effective_tools,
1451        };
1452
1453        // Exponential backoff: retry up to 3× on 5xx / timeout / connect errors.
1454        preflight_chat_request(
1455            &model,
1456            &request.messages,
1457            request.tools.as_deref().unwrap_or(&[]),
1458            self.current_context_length(),
1459        )?;
1460
1461        let mut last_err = String::new();
1462        let mut response_opt: Option<reqwest::Response> = None;
1463        for attempt in 0..3u32 {
1464            match self.client.post(&self.api_url).json(&request).send().await {
1465                Ok(res) if res.status().is_success() => {
1466                    response_opt = Some(res);
1467                    break;
1468                }
1469                Ok(res) if res.status().as_u16() >= 500 => {
1470                    last_err = format!("LM Studio error {}", res.status());
1471                }
1472                Ok(res) => {
1473                    // 4xx — don't retry
1474                    let status = res.status();
1475                    let body = res.text().await.unwrap_or_default();
1476                    let preview = &body[..body.len().min(300)];
1477                    return Err(format!("LM Studio error {}: {}", status, preview));
1478                }
1479                Err(e) if e.is_timeout() || e.is_connect() => {
1480                    last_err = format!("Request failed: {}", e);
1481                }
1482                Err(e) => return Err(format!("Request failed: {}", e)),
1483            }
1484            if attempt < 2 {
1485                let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1486                tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1487            }
1488        }
1489        let res = response_opt
1490            .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1491
1492        let body: ChatResponse = res
1493            .json()
1494            .await
1495            .map_err(|e| format!("Response parse error: {}", e))?;
1496
1497        if let Some(usage) = &body.usage {
1498            let mut econ = self.economics.lock().unwrap();
1499            econ.input_tokens += usage.prompt_tokens;
1500            econ.output_tokens += usage.completion_tokens;
1501        }
1502
1503        let choice = body
1504            .choices
1505            .into_iter()
1506            .next()
1507            .ok_or_else(|| "Empty response from model".to_string())?;
1508
1509        let finish_reason = choice.finish_reason;
1510        let mut tool_calls = choice.message.tool_calls;
1511        let mut content = choice.message.content;
1512
1513        // Gemma-4 Fallback: If the model outputs native <|tool_call|> tags in the text content,
1514        // extract them and treat them as valid tool calls.
1515        if let Some(raw_content) = &content {
1516            let native_calls = extract_native_tool_calls(raw_content);
1517            if !native_calls.is_empty() {
1518                let mut existing = tool_calls.unwrap_or_default();
1519                existing.extend(native_calls);
1520                tool_calls = Some(existing);
1521                let stripped = strip_native_tool_call_text(raw_content);
1522                content = if stripped.trim().is_empty() {
1523                    None
1524                } else {
1525                    Some(stripped)
1526                };
1527            }
1528        }
1529
1530        if is_gemma4_model_name(&model) {
1531            if let Some(calls) = tool_calls.as_mut() {
1532                for call in calls.iter_mut() {
1533                    call.function.arguments = normalize_tool_argument_string(
1534                        &call.function.name,
1535                        &call.function.arguments,
1536                    );
1537                }
1538            }
1539        }
1540
1541        // Qwen3 Fallback: When the model generates tool calls inside a <think> block,
1542        // LM Studio routes the entire thinking output (including <tool_call> XML) to
1543        // `reasoning_content` instead of `tool_calls`. If content is empty and we have
1544        // no tool calls yet, check reasoning_content for embedded tool call markup.
1545        let reasoning_text = choice.message.reasoning_content.unwrap_or_default();
1546        if tool_calls.as_ref().map(|v| v.is_empty()).unwrap_or(true)
1547            && content
1548                .as_ref()
1549                .map(|s| s.trim().is_empty())
1550                .unwrap_or(true)
1551            && !reasoning_text.is_empty()
1552        {
1553            let recovered = extract_native_tool_calls(&reasoning_text);
1554            if !recovered.is_empty() {
1555                tool_calls = Some(recovered);
1556                // Clear content so downstream code doesn't see an empty string.
1557                content = None;
1558            } else if finish_reason.as_deref() == Some("stop") {
1559                // Qwen3.5 thinking-mode answer leak: the model wrote its final answer
1560                // inside reasoning_content and left content empty. finish_reason=stop
1561                // with no tool calls means the model intended this as its response.
1562                // Surface reasoning_content as the visible response rather than
1563                // firing the empty-response nudge loop.
1564                content = Some(reasoning_text);
1565            }
1566        }
1567
1568        Ok((content, tool_calls, body.usage, finish_reason))
1569    }
1570
1571    // ── Streaming call (used for plain-text responses) ────────────────────────
1572
1573    /// Stream a conversation (no tools). Emits Token/Done/Error events.
1574    pub async fn stream_messages(
1575        &self,
1576        messages: &[ChatMessage],
1577        tx: mpsc::Sender<InferenceEvent>,
1578    ) -> Result<(), Box<dyn std::error::Error>> {
1579        let current_model = self.current_model();
1580        let request_messages = if should_use_gemma_native_formatting(self, &current_model) {
1581            prepare_gemma_native_messages(messages)
1582        } else {
1583            messages
1584                .iter()
1585                .map(|m| {
1586                    let mut clone = m.clone();
1587                    let current_text = m.content.as_str();
1588                    if !current_text.starts_with("<|turn>") {
1589                        clone.content = MessageContent::Text(format!(
1590                            "<|turn>{}\n{}\n<turn|>",
1591                            m.role, current_text
1592                        ));
1593                    }
1594                    clone
1595                })
1596                .collect()
1597        };
1598
1599        let request = ChatRequest {
1600            model: current_model.clone(),
1601            messages: request_messages,
1602            temperature: 0.7,
1603            stream: true,
1604            tools: None,
1605        };
1606
1607        if let Err(e) = preflight_chat_request(
1608            &current_model,
1609            &request.messages,
1610            &[],
1611            self.current_context_length(),
1612        ) {
1613            let tag = classify_runtime_failure_tag(&e);
1614            let _ = tx
1615                .send(InferenceEvent::ProviderStatus {
1616                    state: provider_state_for_failure_tag(tag),
1617                    summary: compact_runtime_failure_summary(tag, &e),
1618                })
1619                .await;
1620            let _ = tx
1621                .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1622                .await;
1623            let _ = tx.send(InferenceEvent::Done).await;
1624            return Ok(());
1625        }
1626
1627        let mut last_err = String::new();
1628        let mut response_opt: Option<reqwest::Response> = None;
1629        for attempt in 0..2u32 {
1630            match self.client.post(&self.api_url).json(&request).send().await {
1631                Ok(res) if res.status().is_success() => {
1632                    response_opt = Some(res);
1633                    break;
1634                }
1635                Ok(res) if res.status().as_u16() >= 500 => {
1636                    last_err = format!("LM Studio error {}", res.status());
1637                }
1638                Ok(res) => {
1639                    let status = res.status();
1640                    let body = res.text().await.unwrap_or_default();
1641                    let preview = &body[..body.len().min(300)];
1642                    let detail = format!("LM Studio error {}: {}", status, preview);
1643                    let tag = classify_runtime_failure_tag(&detail);
1644                    let _ = tx
1645                        .send(InferenceEvent::ProviderStatus {
1646                            state: provider_state_for_failure_tag(tag),
1647                            summary: compact_runtime_failure_summary(tag, &detail),
1648                        })
1649                        .await;
1650                    let _ = tx
1651                        .send(InferenceEvent::Error(format_runtime_failure_message(
1652                            &detail,
1653                        )))
1654                        .await;
1655                    let _ = tx.send(InferenceEvent::Done).await;
1656                    return Ok(());
1657                }
1658                Err(e) if e.is_timeout() || e.is_connect() => {
1659                    last_err = format!("Request failed: {}", e);
1660                }
1661                Err(e) => {
1662                    let detail = format!("Request failed: {}", e);
1663                    let tag = classify_runtime_failure_tag(&detail);
1664                    let _ = tx
1665                        .send(InferenceEvent::ProviderStatus {
1666                            state: provider_state_for_failure_tag(tag),
1667                            summary: compact_runtime_failure_summary(tag, &detail),
1668                        })
1669                        .await;
1670                    let _ = tx
1671                        .send(InferenceEvent::Error(format_runtime_failure_message(
1672                            &detail,
1673                        )))
1674                        .await;
1675                    let _ = tx.send(InferenceEvent::Done).await;
1676                    return Ok(());
1677                }
1678            }
1679            if attempt < 1 {
1680                let _ = tx
1681                    .send(InferenceEvent::ProviderStatus {
1682                        state: ProviderRuntimeState::Recovering,
1683                        summary: "LM Studio degraded during stream startup; retrying once.".into(),
1684                    })
1685                    .await;
1686                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1687            }
1688        }
1689        let Some(res) = response_opt else {
1690            let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1691            let tag = classify_runtime_failure_tag(&detail);
1692            let _ = tx
1693                .send(InferenceEvent::ProviderStatus {
1694                    state: provider_state_for_failure_tag(tag),
1695                    summary: compact_runtime_failure_summary(tag, &detail),
1696                })
1697                .await;
1698            let _ = tx
1699                .send(InferenceEvent::Error(format_runtime_failure_message(
1700                    &detail,
1701                )))
1702                .await;
1703            let _ = tx.send(InferenceEvent::Done).await;
1704            return Ok(());
1705        };
1706
1707        use futures::StreamExt;
1708        let mut byte_stream = res.bytes_stream();
1709
1710        // [Collaborative Strategy] TokenBuffer refactor suggested by Hematite local agent.
1711        // Aggregates tokens to ensure coherent linguistic chunks for UI/Voice.
1712        let mut line_buffer = String::new();
1713        let mut content_buffer = String::new();
1714        let mut past_think = false;
1715        let mut emitted_any_content = false;
1716        let mut emitted_live_status = false;
1717
1718        // Immediate cancel gate: break *before* awaiting the stream
1719        // so Escape works even when LM Studio is silent between chunks.
1720        loop {
1721            let next = tokio::select! {
1722                // Race: next SSE chunk vs cancel poll
1723                chunk = byte_stream.next() => chunk,
1724                _ = tokio::time::sleep(std::time::Duration::from_millis(50)) => {
1725                    if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1726                        break;
1727                    }
1728                    continue;
1729                }
1730            };
1731
1732            let Some(item) = next else { break };
1733
1734            let chunk = match item {
1735                Ok(chunk) => chunk,
1736                Err(e) => {
1737                    let detail = format!("Request failed: {}", e);
1738                    let tag = classify_runtime_failure_tag(&detail);
1739                    let _ = tx
1740                        .send(InferenceEvent::ProviderStatus {
1741                            state: provider_state_for_failure_tag(tag),
1742                            summary: compact_runtime_failure_summary(tag, &detail),
1743                        })
1744                        .await;
1745                    let _ = tx
1746                        .send(InferenceEvent::Error(format_runtime_failure_message(
1747                            &detail,
1748                        )))
1749                        .await;
1750                    let _ = tx.send(InferenceEvent::Done).await;
1751                    return Ok(());
1752                }
1753            };
1754            line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1755
1756            while let Some(pos) = line_buffer.find("\n\n") {
1757                let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1758                let data_pos = match event_str.find("data: ") {
1759                    Some(p) => p,
1760                    None => continue,
1761                };
1762
1763                let data = event_str[data_pos + 6..].trim();
1764                if data == "[DONE]" {
1765                    break;
1766                }
1767
1768                if let Ok(json) = serde_json::from_str::<Value>(data) {
1769                    let delta = &json["choices"][0]["delta"];
1770
1771                    // Process reasoning/thought deltas (Qwen/O1 style)
1772                    if let Some(reasoning) = delta["reasoning_content"]
1773                        .as_str()
1774                        .or_else(|| delta["thought"].as_str())
1775                    {
1776                        if !reasoning.is_empty() {
1777                            past_think = false; // We are in reasoning mode
1778                            content_buffer.push_str(reasoning);
1779                            if content_buffer.len() > 30
1780                                && (reasoning.contains('\n') || reasoning.contains('.'))
1781                            {
1782                                let _ = tx
1783                                    .send(InferenceEvent::Thought(content_buffer.clone()))
1784                                    .await;
1785                                emitted_any_content = true;
1786                                content_buffer.clear();
1787                            }
1788                        }
1789                    }
1790
1791                    // Process standard content deltas
1792                    if let Some(content) = delta["content"].as_str() {
1793                        if content.is_empty() {
1794                            continue;
1795                        }
1796
1797                        // Auto-transition: if we have content but were in 'thinking' mode,
1798                        // and haven't seen an explicit tag, assume reasoning is over once content is non-empty.
1799                        if !past_think && !content_buffer.is_empty() && !content.trim().is_empty() {
1800                            // Only transition if the content isn't logically part of the thinking
1801                            // block (some models mix them). Standard heuristic: first non-whitespace content.
1802                            let _ = tx
1803                                .send(InferenceEvent::Thought(content_buffer.clone()))
1804                                .await;
1805                            content_buffer.clear();
1806                            past_think = true;
1807                        }
1808
1809                        if !past_think {
1810                            let lc = content.to_lowercase();
1811                            let close = lc
1812                                .find("<channel|>")
1813                                .map(|i| (i, "<channel|>".len()))
1814                                .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1815
1816                            if let Some((tag_start, tag_len)) = close {
1817                                // Flush any existing thought buffer
1818                                let before = &content[..tag_start];
1819                                content_buffer.push_str(before);
1820                                if !content_buffer.trim().is_empty() {
1821                                    let _ = tx
1822                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1823                                        .await;
1824                                    emitted_any_content = true;
1825                                }
1826                                content_buffer.clear();
1827
1828                                past_think = true;
1829                                let after = content[tag_start + tag_len..].trim_start_matches('\n');
1830                                content_buffer.push_str(after);
1831                            } else {
1832                                // Still in reasoning block
1833                                content_buffer.push_str(content);
1834                                if content_buffer.len() > 30
1835                                    && (content.contains('\n') || content.contains('.'))
1836                                {
1837                                    let _ = tx
1838                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1839                                        .await;
1840                                    emitted_any_content = true;
1841                                    content_buffer.clear();
1842                                }
1843                            }
1844                        } else {
1845                            // PAST THINK: final answer tokens.
1846                            content_buffer.push_str(content);
1847                            let is_boundary = content.contains(' ')
1848                                || content.contains('.')
1849                                || content.contains('!')
1850                                || content.contains('?');
1851
1852                            if content_buffer.len() > 10 && is_boundary {
1853                                if !emitted_live_status {
1854                                    let _ = tx
1855                                        .send(InferenceEvent::ProviderStatus {
1856                                            state: ProviderRuntimeState::Live,
1857                                            summary: String::new(),
1858                                        })
1859                                        .await;
1860                                    emitted_live_status = true;
1861                                }
1862                                let _ =
1863                                    tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1864                                emitted_any_content = true;
1865                                content_buffer.clear();
1866                            }
1867                        }
1868                    }
1869                }
1870            }
1871        }
1872
1873        // Final Flush
1874        if !content_buffer.is_empty() {
1875            if past_think {
1876                if !emitted_live_status {
1877                    let _ = tx
1878                        .send(InferenceEvent::ProviderStatus {
1879                            state: ProviderRuntimeState::Live,
1880                            summary: String::new(),
1881                        })
1882                        .await;
1883                }
1884                let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1885            } else {
1886                let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1887            }
1888            emitted_any_content = true;
1889        }
1890
1891        if !emitted_any_content {
1892            let _ = tx
1893                .send(InferenceEvent::ProviderStatus {
1894                    state: ProviderRuntimeState::EmptyResponse,
1895                    summary: compact_runtime_failure_summary(
1896                        "empty_model_response",
1897                        "Empty response from model",
1898                    ),
1899                })
1900                .await;
1901            let _ = tx
1902                .send(InferenceEvent::Error(format_runtime_failure_message(
1903                    "Empty response from model",
1904                )))
1905                .await;
1906            let _ = tx.send(InferenceEvent::Done).await;
1907            return Ok(());
1908        }
1909
1910        let _ = tx.send(InferenceEvent::Done).await;
1911        Ok(())
1912    }
1913
1914    /// Single-turn streaming (legacy helper used by startup sequence).
1915    pub async fn stream_generation(
1916        &self,
1917        prompt: &str,
1918        snark: u8,
1919        chaos: u8,
1920        brief: bool,
1921        professional: bool,
1922        tx: mpsc::Sender<InferenceEvent>,
1923    ) -> Result<(), Box<dyn std::error::Error>> {
1924        let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1925        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1926        self.stream_messages(&messages, tx).await
1927    }
1928
1929    // ── Swarm worker helpers (non-streaming) ──────────────────────────────────
1930
1931    /// Runs a task using the `worker_model` if set, otherwise falls back to the main `model`.
1932    pub async fn generate_task_worker(
1933        &self,
1934        prompt: &str,
1935        professional: bool,
1936    ) -> Result<String, String> {
1937        let current_model = self.current_model();
1938        let model = self
1939            .worker_model
1940            .as_deref()
1941            .unwrap_or(current_model.as_str());
1942        self.generate_task_with_model(prompt, 0.1, professional, model)
1943            .await
1944    }
1945
1946    pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1947        self.generate_task_with_temp(prompt, 0.1, professional)
1948            .await
1949    }
1950
1951    pub async fn generate_task_with_temp(
1952        &self,
1953        prompt: &str,
1954        temp: f32,
1955        professional: bool,
1956    ) -> Result<String, String> {
1957        let current_model = self.current_model();
1958        self.generate_task_with_model(prompt, temp, professional, &current_model)
1959            .await
1960    }
1961
1962    pub async fn generate_task_with_model(
1963        &self,
1964        prompt: &str,
1965        temp: f32,
1966        professional: bool,
1967        model: &str,
1968    ) -> Result<String, String> {
1969        let _permit = self
1970            .kv_semaphore
1971            .acquire()
1972            .await
1973            .map_err(|e| e.to_string())?;
1974
1975        let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1976        let request_messages = if should_use_gemma_native_formatting(self, model) {
1977            prepare_gemma_native_messages(&[
1978                ChatMessage::system(&system),
1979                ChatMessage::user(prompt),
1980            ])
1981        } else {
1982            vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1983        };
1984        let request = ChatRequest {
1985            model: model.to_string(),
1986            messages: request_messages,
1987            temperature: temp,
1988            stream: false,
1989            tools: None,
1990        };
1991
1992        preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1993
1994        let res = self
1995            .client
1996            .post(&self.api_url)
1997            .json(&request)
1998            .send()
1999            .await
2000            .map_err(|e| format!("LM Studio request failed: {}", e))?;
2001
2002        let body: ChatResponse = res
2003            .json()
2004            .await
2005            .map_err(|e| format!("Failed to parse response: {}", e))?;
2006
2007        body.choices
2008            .first()
2009            .and_then(|c| c.message.content.clone())
2010            .ok_or_else(|| "Empty response from model".to_string())
2011    }
2012
2013    // ── History management ────────────────────────────────────────────────────
2014
2015    /// Prune middle turns when context grows too large, keeping system + recent N.
2016    #[allow(dead_code)]
2017    pub fn snip_history(
2018        &self,
2019        turns: &[ChatMessage],
2020        max_tokens_estimate: usize,
2021        keep_recent: usize,
2022    ) -> Vec<ChatMessage> {
2023        let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
2024        if total_chars / 4 <= max_tokens_estimate {
2025            return turns.to_vec();
2026        }
2027        let keep = keep_recent.min(turns.len());
2028        let mut snipped = vec![turns[0].clone()];
2029        if turns.len() > keep + 1 {
2030            snipped.push(ChatMessage::system(&format!(
2031                "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
2032                turns.len() - keep - 1
2033            )));
2034            snipped.extend_from_slice(&turns[turns.len() - keep..]);
2035        } else {
2036            snipped = turns.to_vec();
2037        }
2038        snipped
2039    }
2040}
2041
2042fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
2043    serde_json::to_vec(value)
2044        .ok()
2045        .map_or(0, |bytes| bytes.len() / 4 + 1)
2046}
2047
2048const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
2049
2050fn estimate_message_tokens(message: &ChatMessage) -> usize {
2051    let content_tokens = match &message.content {
2052        MessageContent::Text(s) => s.len() / 4 + 1,
2053        MessageContent::Parts(parts) => parts
2054            .iter()
2055            .map(|part| match part {
2056                ContentPart::Text { text } => text.len() / 4 + 1,
2057                // Image payloads are transported as data URLs, but their base64
2058                // length should not be treated like plain text context pressure.
2059                ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
2060            })
2061            .sum(),
2062    };
2063    let tool_tokens: usize = message
2064        .tool_calls
2065        .iter()
2066        .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
2067        .sum();
2068    content_tokens + tool_tokens + 6
2069}
2070
2071pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
2072    messages.iter().map(estimate_message_tokens).sum()
2073}
2074
2075fn reserved_output_tokens(context_length: usize) -> usize {
2076    let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2077    proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2078}
2079
2080pub fn estimate_prompt_pressure(
2081    messages: &[ChatMessage],
2082    tools: &[ToolDefinition],
2083    context_length: usize,
2084) -> (usize, usize, usize, u8) {
2085    let estimated_input_tokens =
2086        estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2087    let reserved_output = reserved_output_tokens(context_length);
2088    let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2089    let percent = if context_length == 0 {
2090        0
2091    } else {
2092        ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2093    };
2094    (
2095        estimated_input_tokens,
2096        reserved_output,
2097        estimated_total,
2098        percent,
2099    )
2100}
2101
2102fn preflight_chat_request(
2103    model: &str,
2104    messages: &[ChatMessage],
2105    tools: &[ToolDefinition],
2106    context_length: usize,
2107) -> Result<(), String> {
2108    let (estimated_input_tokens, reserved_output, estimated_total, _) =
2109        estimate_prompt_pressure(messages, tools, context_length);
2110
2111    if estimated_total > context_length {
2112        return Err(format!(
2113            "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2114            model, estimated_input_tokens, reserved_output, estimated_total, context_length
2115        ));
2116    }
2117
2118    Ok(())
2119}
2120
/// Collect instruction files from the CWD and up to 3 parent directories
/// (4 directory levels in total).
///
/// Looks for CLAUDE.md, CLAUDE.local.md, and .hematite/instructions.md in each
/// directory. Files that are unreadable or blank are skipped, and files whose
/// content hash was already seen are not repeated. Each file is truncated to
/// at most 4 KB (on a UTF-8 character boundary); a file that would push the
/// combined size past 12 KB is not added and ends the scan of that directory.
///
/// Returns an empty string when nothing was found or the CWD is unavailable,
/// otherwise a "# Project Instructions" section with one labelled entry per
/// file.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::fmt::Write as _;
    use std::hash::{Hash, Hasher};

    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;
    const CANDIDATES: [&str; 3] = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    let Ok(mut dir) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;

    for _ in 0..4 {
        for name in &CANDIDATES {
            let path = dir.join(name);
            // A missing file simply fails to read; no separate existence check needed.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back up to a char boundary: slicing in the middle of a
                // multi-byte character would panic.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            if total_chars + truncated.len() > MAX_TOTAL {
                break;
            }
            total_chars += truncated.len();
            // Writing into a String cannot fail.
            let _ = write!(result, "\n--- {} ---\n{}\n", path.display(), truncated);
        }
        // Move one level up; stop at the filesystem root.
        if !dir.pop() {
            break;
        }
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2184
/// Extract the reasoning text enclosed by the Gemma-4 native tags
/// `<|channel>thought` … `<channel|>` (matched ASCII case-insensitively).
///
/// When the closing tag is missing, everything after the opening tag is
/// treated as the thought. Returns `None` when there is no opening tag or
/// the enclosed text is blank.
pub fn extract_think_block(text: &str) -> Option<String> {
    const OPEN_TAG: &str = "<|channel>thought";
    const CLOSE_TAG: &str = "<channel|>";

    // ASCII-only lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid offsets into `text`. Full Unicode lowercasing can
    // shrink or grow characters and would shift every later offset.
    let lower = text.to_ascii_lowercase();

    let content_start = lower.find(OPEN_TAG)? + OPEN_TAG.len();
    let content_end = lower[content_start..]
        .find(CLOSE_TAG)
        .map_or(text.len(), |offset| content_start + offset);

    let content = text[content_start..content_end].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
2207
2208pub fn strip_think_blocks(text: &str) -> String {
2209    // Fast-path: strip a stray </think> the model emits at the start when it skips
2210    // the opening tag (common with Qwen after tool calls). Strip it before the lower
2211    // allocation so it can't slip through any branch below.
2212    let text = {
2213        let t = text.trim_start();
2214        if t.to_lowercase().starts_with("</think>") {
2215            &t[8..]
2216        } else {
2217            text
2218        }
2219    };
2220
2221    let lower = text.to_lowercase();
2222
2223    // Use the official Gemma-4 closing tag — answer is everything after it.
2224    if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
2225        let answer = text[end..]
2226            .replace("<|channel>thought", "")
2227            .replace("<channel|>", "");
2228        return answer.trim().replace("\n\n\n", "\n\n").to_string();
2229    }
2230
2231    // No closing tag — if there's an unclosed opening tag, discard everything before and during it.
2232    let first_open = [
2233        lower.find("<|channel>thought"), // Prioritize Gemma-4 native
2234        lower.find("<think>"),
2235        lower.find("<thought>"),
2236        lower.find("<|think|>"),
2237    ]
2238    .iter()
2239    .filter_map(|&x| x)
2240    .min();
2241
2242    if let Some(start) = first_open {
2243        if start > 0 {
2244            return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
2245        }
2246        return String::new();
2247    }
2248
2249    // If the model outputs 'naked' reasoning without tags:
2250    // Strip leading sentences like "The user asked..." or "I should present..."
2251    // if they appear before actual answer content.
2252    let naked_reasoning_phrases: &[&str] = &[
2253        "the user asked",
2254        "the user is asking",
2255        "the user wants",
2256        "i will structure",
2257        "i should provide",
2258        "i should give",
2259        "i should avoid",
2260        "i should note",
2261        "i should focus",
2262        "i should keep",
2263        "i should respond",
2264        "i should present",
2265        "i should display",
2266        "i should show",
2267        "i need to",
2268        "i can see from",
2269        "without being overly",
2270        "let me ",
2271        "necessary information in my identity",
2272        "was computed successfully",
2273        "computed successfully",
2274    ];
2275    let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
2276    if is_naked_reasoning {
2277        let lines: Vec<&str> = text.lines().collect();
2278        if !lines.is_empty() {
2279            // Skip leading lines that are themselves reasoning prose or blank.
2280            // Stop skipping at the first line that looks like real answer content.
2281            let mut start_idx = 0;
2282            for (i, line) in lines.iter().enumerate() {
2283                let l = line.to_lowercase();
2284                let is_reasoning_line =
2285                    naked_reasoning_phrases.iter().any(|p| l.contains(p)) || l.trim().is_empty();
2286                if is_reasoning_line {
2287                    start_idx = i + 1;
2288                } else {
2289                    break;
2290                }
2291            }
2292            if start_idx < lines.len() {
2293                return lines[start_idx..]
2294                    .join("\n")
2295                    .trim()
2296                    .replace("\n\n\n", "\n\n")
2297                    .to_string();
2298            }
2299            // Entire response was reasoning prose — return empty.
2300            return String::new();
2301        }
2302    }
2303
2304    // Strip leaked XML tool-call fragments that Qwen sometimes emits when it
2305    // abandons a tool call mid-generation (e.g. </parameter></function></tool_call>).
2306    let cleaned = strip_xml_tool_call_artifacts(text);
2307    cleaned.trim().replace("\n\n\n", "\n\n").to_string()
2308}
2309
/// Remove stray XML tool-call and reasoning tags that local models occasionally
/// leak into visible output when they start-then-abandon a tool call.
///
/// Matching is ASCII case-insensitive. Tags are processed one kind at a time, in
/// the order they are listed in `XML_ARTIFACTS`, and each kind is removed until
/// no occurrence remains (so a removal may expose a tag that is listed later).
///
/// Only the tags themselves are deleted. Surrounding text, whitespace and blank
/// lines are left untouched; any trimming or blank-line collapsing is up to the
/// caller.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    // Tags to remove (both open and close forms). Every entry must be lowercase
    // ASCII, because the search below runs against an ASCII-lowercased copy.
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        // Stray think/reasoning closing tags that leak after block extraction.
        "</think>",
        "</thought>",
        "</thinking>",
    ];

    let mut out = text.to_string();
    // ASCII folding never changes byte lengths, so an offset found in `folded`
    // is always a valid offset (and char boundary) in `out`. Full Unicode
    // lowercasing does not have that property: some characters lowercase to a
    // different number of bytes, which would make the offsets point at the
    // wrong bytes of the original text.
    let mut folded = text.to_ascii_lowercase();

    for &tag in XML_ARTIFACTS {
        while let Some(pos) = folded.find(tag) {
            let end = pos + tag.len();
            // Remove the same span from both strings to keep them aligned.
            folded.drain(pos..end);
            out.drain(pos..end);
        }
    }
    out
}
2342
2343/// Extract native Gemma-4 <|tool_call|> tags from text.
2344/// Format: <|tool_call|>call:func_name{key:<|"|>value<|"|>, key2:value2}<tool_call|>
2345pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2346    use regex::Regex;
2347    let mut results = Vec::new();
2348
2349    // -- Format 1: Gemma 4 Native (call:name{args}) --
2350    let re_call = Regex::new(
2351        r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2352    ).unwrap();
2353    let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2354
2355    for cap in re_call.captures_iter(text) {
2356        let name = cap[1].to_string();
2357        let args_str = &cap[2];
2358        let mut arguments = serde_json::Map::new();
2359
2360        for arg_cap in re_arg.captures_iter(args_str) {
2361            let key = arg_cap[1].to_string();
2362            let val_raw = arg_cap
2363                .get(2)
2364                .map(|m| m.as_str())
2365                .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2366                .unwrap_or("")
2367                .trim();
2368            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2369
2370            let val = if normalized_raw == "true" {
2371                Value::Bool(true)
2372            } else if normalized_raw == "false" {
2373                Value::Bool(false)
2374            } else if let Ok(n) = normalized_raw.parse::<i64>() {
2375                Value::Number(n.into())
2376            } else if let Ok(n) = normalized_raw.parse::<u64>() {
2377                Value::Number(n.into())
2378            } else if let Ok(n) = normalized_raw.parse::<f64>() {
2379                serde_json::Number::from_f64(n)
2380                    .map(Value::Number)
2381                    .unwrap_or(Value::String(normalized_raw.clone()))
2382            } else {
2383                Value::String(normalized_raw)
2384            };
2385
2386            arguments.insert(key, val);
2387        }
2388
2389        results.push(ToolCallResponse {
2390            id: format!("call_{}", rand::random::<u32>()),
2391            call_type: "function".to_string(),
2392            function: ToolCallFn {
2393                name,
2394                arguments: Value::Object(arguments).to_string(),
2395            },
2396        });
2397    }
2398
2399    // -- Format 2: XML (Qwen/Claude style) --
2400    let re_xml_call = Regex::new(
2401        r#"(?s)<tool_call>\s*<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>)?\s*</tool_call>"#
2402    ).unwrap();
2403    let re_xml_param =
2404        Regex::new(r#"(?s)<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)</parameter>"#).unwrap();
2405
2406    for cap in re_xml_call.captures_iter(text) {
2407        let name = cap[1].to_string();
2408        let body = &cap[2];
2409        let mut arguments = serde_json::Map::new();
2410
2411        for p_cap in re_xml_param.captures_iter(body) {
2412            let key = p_cap[1].to_string();
2413            let val_raw = p_cap[2].trim();
2414            let val = if val_raw == "true" {
2415                Value::Bool(true)
2416            } else if val_raw == "false" {
2417                Value::Bool(false)
2418            } else if let Ok(n) = val_raw.parse::<i64>() {
2419                Value::Number(n.into())
2420            } else if let Ok(n) = val_raw.parse::<u64>() {
2421                Value::Number(n.into())
2422            } else {
2423                Value::String(val_raw.to_string())
2424            };
2425            arguments.insert(key, val);
2426        }
2427
2428        if !arguments.is_empty() || name == "clear_context" {
2429            results.push(ToolCallResponse {
2430                id: format!("call_{}", rand::random::<u32>()),
2431                call_type: "function".to_string(),
2432                function: ToolCallFn {
2433                    name,
2434                    arguments: Value::Object(arguments).to_string(),
2435                },
2436            });
2437        }
2438    }
2439
2440    results
2441}
2442
2443pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2444    let trimmed = raw.trim();
2445    let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2446
2447    let mut value = match serde_json::from_str::<Value>(&candidate) {
2448        Ok(v) => v,
2449        Err(_) => return candidate,
2450    };
2451    normalize_tool_argument_value(tool_name, &mut value);
2452    value.to_string()
2453}
2454
2455fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2456    match value {
2457        Value::String(s) => *s = normalize_string_arg(s),
2458        Value::Array(items) => {
2459            for item in items {
2460                normalize_tool_argument_value(tool_name, item);
2461            }
2462        }
2463        Value::Object(map) => {
2464            for val in map.values_mut() {
2465                normalize_tool_argument_value(tool_name, val);
2466            }
2467            if tool_name == "grep_files" {
2468                if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2469                    *pattern = normalize_regex_pattern(pattern);
2470                }
2471            }
2472            for key in ["path", "extension", "query", "command", "reason"] {
2473                if let Some(Value::String(s)) = map.get_mut(key) {
2474                    *s = normalize_string_arg(s);
2475                }
2476            }
2477        }
2478        _ => {}
2479    }
2480}
2481
/// Strip exactly one layer of matching quotes from `input`.
///
/// Returns `None` when `input` is shorter than two bytes or is not enclosed in
/// a matching pair of `"`, `'` or backtick characters. Otherwise the text
/// between the quotes is returned with `\"` replaced by `"` and then `\\`
/// replaced by `\`, and with surrounding whitespace trimmed.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    if input.len() < 2 {
        return None;
    }

    // Peel one character off each end; what is left in the iterator is the body.
    let mut rest = input.chars();
    let opening = rest.next()?;
    let closing = rest.next_back()?;

    let is_quote = matches!(opening, '"' | '\'' | '`');
    if !(is_quote && opening == closing) {
        return None;
    }

    let body = rest.as_str().replace("\\\"", "\"").replace("\\\\", "\\");
    Some(body.trim().to_string())
}
2495
/// Trim `input` and peel off every layer of matching surrounding quotes.
///
/// A layer is a pair of identical `"`, `'` or backtick characters at the very
/// start and very end of the text. After each layer is removed the remainder is
/// trimmed again, and peeling repeats until no matching pair is left.
fn normalize_string_arg(input: &str) -> String {
    let mut current = input.trim();
    loop {
        // `strip_prefix` followed by `strip_suffix` only succeeds when the two
        // quote characters are distinct, i.e. the text is at least two bytes.
        let peeled = ['"', '\'', '`'].iter().find_map(|&quote| {
            current
                .strip_prefix(quote)
                .and_then(|rest| rest.strip_suffix(quote))
        });
        match peeled {
            Some(inner) => current = inner.trim(),
            None => return current.to_string(),
        }
    }
}
2513
2514fn normalize_regex_pattern(input: &str) -> String {
2515    let out = normalize_string_arg(input);
2516    if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2517        out[1..out.len() - 1].to_string()
2518    } else {
2519        out
2520    }
2521}
2522
2523fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2524    let mut system_blocks = Vec::new();
2525    let mut prepared = Vec::new();
2526    let mut seeded = false;
2527
2528    for message in messages {
2529        if message.role == "system" {
2530            let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2531                .trim()
2532                .to_string();
2533            if !cleaned.is_empty() {
2534                system_blocks.push(cleaned);
2535            }
2536            continue;
2537        }
2538
2539        let mut clone = message.clone();
2540        clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2541
2542        if !seeded && message.role == "user" {
2543            let mut merged = String::new();
2544            if !system_blocks.is_empty() {
2545                merged.push_str("System instructions for this turn:\n");
2546                merged.push_str(&system_blocks.join("\n\n"));
2547                merged.push_str("\n\n");
2548            }
2549            merged.push_str(clone.content.as_str());
2550            clone.content = MessageContent::Text(merged);
2551            seeded = true;
2552        }
2553
2554        prepared.push(clone);
2555    }
2556
2557    if !seeded && !system_blocks.is_empty() {
2558        prepared.insert(
2559            0,
2560            ChatMessage::user(&format!(
2561                "System instructions for this turn:\n{}",
2562                system_blocks.join("\n\n")
2563            )),
2564        );
2565    }
2566
2567    prepared
2568}
2569
/// Remove legacy turn-wrapper markers from `text` and trim the result.
///
/// The opening markers `<|turn>system`, `<|turn>user`, `<|turn>assistant` and
/// `<|turn>tool` are only removed together with the newline that directly
/// follows them; the closing marker `<turn|>` is removed wherever it appears.
/// Markers are removed in the order listed below.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const TURN_MARKERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let mut cleaned = text.to_string();
    for marker in TURN_MARKERS.iter() {
        cleaned = cleaned.replace(marker, "");
    }
    cleaned.trim().to_string()
}
2579
2580pub fn strip_native_tool_call_text(text: &str) -> String {
2581    use regex::Regex;
2582    // Format 1: Gemma 4 Native
2583    let re_call = Regex::new(
2584        r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2585    ).unwrap();
2586    // Format 2: XML (Qwen/Claude style)
2587    let re_xml = Regex::new(r#"(?s)<tool_call>\s*<function=.*?>.*?</tool_call>"#).unwrap();
2588    let re_response =
2589        Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2590            .unwrap();
2591    let without_calls = re_call.replace_all(text, "");
2592    let without_xml = re_xml.replace_all(without_calls.as_ref(), "");
2593    re_response
2594        .replace_all(without_xml.as_ref(), "")
2595        .trim()
2596        .to_string()
2597}
2598
#[cfg(test)]
mod tests {
    use super::*;

    /// Decode the JSON argument string carried by an extracted tool call.
    fn parsed_args(call: &ToolCallResponse) -> Value {
        serde_json::from_str(&call.function.arguments).unwrap()
    }

    /// Collect the function names of the extracted calls, in order.
    fn call_names(calls: &[ToolCallResponse]) -> Vec<&str> {
        calls
            .iter()
            .map(|call| call.function.name.as_str())
            .collect()
    }

    /// The generated system prompt must mention the running Hematite version.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let prompt = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);
        assert!(prompt.contains(crate::HEMATITE_VERSION));
    }

    /// A Gemma call opened with `<|tool_call>` and closed with `<tool_call|>`
    /// is extracted with typed arguments and removed by the stripper.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        let transcript = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(transcript);
        assert_eq!(call_names(&calls), ["read_file"]);

        let args = parsed_args(&calls[0]);
        assert_eq!(args["limit"].as_i64(), Some(100));
        assert_eq!(args["offset"].as_i64(), Some(100));
        assert_eq!(args["path"].as_str(), Some("src/ui/tui.rs"));

        let visible = strip_native_tool_call_text(transcript);
        for leaked in ["<|tool_call", "<tool_call|>"] {
            assert!(!visible.contains(leaked));
        }
    }

    /// Tool responses the model invented between its own calls must not
    /// survive stripping, and both calls must still be extracted in order.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let transcript = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(transcript);
        assert_eq!(call_names(&calls), ["list_files", "read_file"]);

        let visible = strip_native_tool_call_text(transcript);
        for leaked in ["<|tool_call", "<|tool_response", "<tool_response|>"] {
            assert!(!visible.contains(leaked));
        }
    }

    /// Qwen-style XML tool calls embedded after reasoning prose are extracted
    /// with trimmed parameter values and removed by the stripper.
    #[test]
    fn extracts_qwen_xml_tool_calls_from_reasoning() {
        let transcript = r#"Based on the project structure, I need to check the binary.
<tool_call>
<function=shell>
<parameter=command>
ls -la hematite.exe
</parameter>
<parameter=reason>
Check if the binary exists
</parameter>
</function>
</tool_call>"#;

        let calls = extract_native_tool_calls(transcript);
        assert_eq!(call_names(&calls), ["shell"]);

        let args = parsed_args(&calls[0]);
        assert_eq!(args["command"].as_str(), Some("ls -la hematite.exe"));
        assert_eq!(args["reason"].as_str(), Some("Check if the binary exists"));

        let visible = strip_native_tool_call_text(transcript);
        for leaked in ["<tool_call>", "<function=shell>"] {
            assert!(!visible.contains(leaked));
        }
    }
}