Skip to main content

hematite/agent/
inference.rs

1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
7// ── Engine ────────────────────────────────────────────────────────────────────
8
9pub struct InferenceEngine {
10    pub client: reqwest::Client,
11    pub api_url: String,
12    /// Root URL of the LLM provider (e.g. `http://localhost:1234`).
13    /// All non-completions endpoints (models list, health, embeddings) are derived from this.
14    pub base_url: String,
15    pub species: String,
16    pub snark: u8,
17    pub kv_semaphore: Semaphore,
18    /// The model ID currently loaded in LM Studio (auto-detected on boot).
19    pub model: std::sync::RwLock<String>,
20    /// Context window length in tokens (auto-detected from LM Studio, default 32768).
21    pub context_length: std::sync::atomic::AtomicUsize,
22    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
23    /// Optional model ID for worker-level tasks (Swarms / research).
24    pub worker_model: Option<String>,
25    /// Opt-in Gemma-native request shaping. Off by default.
26    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
27    /// Global cancellation token for hard-interrupting the inference stream.
28    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
29}
30
/// Reports whether `model` names a Gemma 4 model.
///
/// The check is ASCII case-insensitive and accepts both the hyphenated
/// (`gemma-4`) and the compact (`gemma4`) spelling anywhere in the identifier.
pub fn is_gemma4_model_name(model: &str) -> bool {
    let normalized = model.to_ascii_lowercase();
    ["gemma-4", "gemma4"]
        .iter()
        .any(|marker| normalized.contains(*marker))
}
35
36fn should_use_gemma_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37    is_gemma4_model_name(model) && engine.gemma_native_formatting_enabled()
38}
39
40// ── OpenAI Tool Definition ────────────────────────────────────────────────────
41
42#[derive(Serialize, Clone, Debug)]
43pub struct ToolDefinition {
44    #[serde(rename = "type")]
45    pub tool_type: String,
46    pub function: ToolFunction,
47    #[serde(skip_serializing, skip_deserializing)]
48    pub metadata: ToolMetadata,
49}
50
51#[derive(Serialize, Clone, Debug)]
52pub struct ToolFunction {
53    pub name: String,
54    pub description: String,
55    pub parameters: Value,
56}
57
/// Coarse grouping assigned to each tool by `tool_metadata_for_name`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ToolCategory {
    /// File reading / searching tools (`read_file`, `grep_files`, ...).
    RepoRead,
    /// File writing / patching tools (`write_file`, `edit_file`, ...).
    RepoWrite,
    /// Shell and host tools (`shell`, `inspect_host`, `resolve_host_issue`).
    Runtime,
    /// Architecture tracing (`trace_runtime_flow`).
    Architecture,
    /// Toolchain description (`describe_toolchain`).
    Toolchain,
    /// Build verification (`verify_build`).
    Verification,
    /// Git operations (`git_commit`, `git_push`, ...).
    Git,
    /// Web research and documentation fetches.
    Research,
    /// Image analysis (`vision_analyze`).
    Vision,
    /// Language-server queries (`lsp_*`).
    Lsp,
    /// Workflow and task helpers.
    Workflow,
    /// MCP-provided tools (names starting with `mcp__`).
    External,
    /// Anything not recognised by name.
    Other,
}
74
75#[derive(Clone, Copy, Debug, PartialEq, Eq)]
76pub struct ToolMetadata {
77    pub category: ToolCategory,
78    pub mutates_workspace: bool,
79    pub external_surface: bool,
80    pub trust_sensitive: bool,
81    pub read_only_friendly: bool,
82    pub plan_scope: bool,
83}
84
85pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
86    if name.starts_with("mcp__") {
87        let lower = name.to_ascii_lowercase();
88        let mutates_workspace = [
89            "__edit",
90            "__write",
91            "__create",
92            "__move",
93            "__delete",
94            "__remove",
95            "__rename",
96            "__replace",
97            "__patch",
98        ]
99        .iter()
100        .any(|needle| lower.contains(needle));
101        return ToolMetadata {
102            category: ToolCategory::External,
103            mutates_workspace,
104            external_surface: true,
105            trust_sensitive: true,
106            read_only_friendly: !mutates_workspace,
107            plan_scope: false,
108        };
109    }
110
111    match name {
112        "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
113            category: ToolCategory::RepoRead,
114            mutates_workspace: false,
115            external_surface: false,
116            trust_sensitive: false,
117            read_only_friendly: true,
118            plan_scope: true,
119        },
120        "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => ToolMetadata {
121            category: ToolCategory::RepoWrite,
122            mutates_workspace: true,
123            external_surface: false,
124            trust_sensitive: true,
125            read_only_friendly: false,
126            plan_scope: true,
127        },
128        "trace_runtime_flow" => ToolMetadata {
129            category: ToolCategory::Architecture,
130            mutates_workspace: false,
131            external_surface: false,
132            trust_sensitive: false,
133            read_only_friendly: true,
134            plan_scope: false,
135        },
136        "describe_toolchain" => ToolMetadata {
137            category: ToolCategory::Toolchain,
138            mutates_workspace: false,
139            external_surface: false,
140            trust_sensitive: false,
141            read_only_friendly: true,
142            plan_scope: false,
143        },
144        "shell" => ToolMetadata {
145            category: ToolCategory::Runtime,
146            mutates_workspace: true,
147            external_surface: false,
148            trust_sensitive: true,
149            read_only_friendly: false,
150            plan_scope: false,
151        },
152        "inspect_host" => ToolMetadata {
153            category: ToolCategory::Runtime,
154            mutates_workspace: false,
155            external_surface: false,
156            trust_sensitive: false,
157            read_only_friendly: true,
158            plan_scope: false,
159        },
160        "resolve_host_issue" => ToolMetadata {
161            category: ToolCategory::Runtime,
162            mutates_workspace: true,
163            external_surface: true,
164            trust_sensitive: true,
165            read_only_friendly: false,
166            plan_scope: false,
167        },
168        "run_hematite_maintainer_workflow" => ToolMetadata {
169            category: ToolCategory::Workflow,
170            mutates_workspace: true,
171            external_surface: false,
172            trust_sensitive: true,
173            read_only_friendly: false,
174            plan_scope: false,
175        },
176        "run_workspace_workflow" => ToolMetadata {
177            category: ToolCategory::Workflow,
178            mutates_workspace: true,
179            external_surface: false,
180            trust_sensitive: true,
181            read_only_friendly: false,
182            plan_scope: false,
183        },
184        "verify_build" => ToolMetadata {
185            category: ToolCategory::Verification,
186            mutates_workspace: false,
187            external_surface: false,
188            trust_sensitive: false,
189            read_only_friendly: true,
190            plan_scope: false,
191        },
192        "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
193            ToolMetadata {
194                category: ToolCategory::Git,
195                mutates_workspace: true,
196                external_surface: false,
197                trust_sensitive: true,
198                read_only_friendly: false,
199                plan_scope: false,
200            }
201        }
202        "research_web" | "fetch_docs" => ToolMetadata {
203            category: ToolCategory::Research,
204            mutates_workspace: false,
205            external_surface: false,
206            trust_sensitive: false,
207            read_only_friendly: true,
208            plan_scope: false,
209        },
210        "vision_analyze" => ToolMetadata {
211            category: ToolCategory::Vision,
212            mutates_workspace: false,
213            external_surface: false,
214            trust_sensitive: false,
215            read_only_friendly: true,
216            plan_scope: false,
217        },
218        "lsp_definitions"
219        | "lsp_references"
220        | "lsp_hover"
221        | "lsp_rename_symbol"
222        | "lsp_get_diagnostics"
223        | "lsp_search_symbol" => ToolMetadata {
224            category: ToolCategory::Lsp,
225            mutates_workspace: false,
226            external_surface: false,
227            trust_sensitive: false,
228            read_only_friendly: true,
229            plan_scope: false,
230        },
231        "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
232            category: ToolCategory::Workflow,
233            mutates_workspace: false,
234            external_surface: false,
235            trust_sensitive: false,
236            read_only_friendly: true,
237            plan_scope: true,
238        },
239        "manage_tasks" => ToolMetadata {
240            category: ToolCategory::Workflow,
241            mutates_workspace: false,
242            external_surface: false,
243            trust_sensitive: false,
244            read_only_friendly: true,
245            plan_scope: false,
246        },
247        _ => ToolMetadata {
248            category: ToolCategory::Other,
249            mutates_workspace: false,
250            external_surface: false,
251            trust_sensitive: false,
252            read_only_friendly: true,
253            plan_scope: false,
254        },
255    }
256}
257
258// ── Message types ─────────────────────────────────────────────────────────────
259
260/// OpenAI-compatible chat message. Content can be a string (legacy) or a
261/// Vec of ContentPart (multimodal).
262#[derive(Serialize, Deserialize, Clone, Debug)]
263pub struct ChatMessage {
264    pub role: String,
265    /// Support both simple string content and complex multi-part content (Vision).
266    pub content: MessageContent,
267    /// Assistant messages may have tool calls. Default to empty vec, not null.
268    #[serde(default, skip_serializing_if = "Vec::is_empty")]
269    pub tool_calls: Vec<ToolCallResponse>,
270    /// Tool message references the original call.
271    #[serde(skip_serializing_if = "Option::is_none")]
272    pub tool_call_id: Option<String>,
273    /// Tool message name.
274    #[serde(skip_serializing_if = "Option::is_none")]
275    pub name: Option<String>,
276}
277
278#[derive(Serialize, Deserialize, Clone, Debug)]
279#[serde(untagged)]
280pub enum MessageContent {
281    Text(String),
282    Parts(Vec<ContentPart>),
283}
284
285#[derive(Serialize, Deserialize, Clone, Debug)]
286#[serde(tag = "type")]
287pub enum ContentPart {
288    #[serde(rename = "text")]
289    Text { text: String },
290    #[serde(rename = "image_url")]
291    ImageUrl { image_url: ImageUrlSource },
292}
293
294#[derive(Serialize, Deserialize, Clone, Debug)]
295pub struct ImageUrlSource {
296    pub url: String,
297}
298
299impl Default for MessageContent {
300    fn default() -> Self {
301        MessageContent::Text(String::new())
302    }
303}
304
305impl MessageContent {
306    pub fn as_str(&self) -> &str {
307        match self {
308            MessageContent::Text(s) => s,
309            MessageContent::Parts(parts) => {
310                for part in parts {
311                    if let ContentPart::Text { text } = part {
312                        return text;
313                    }
314                }
315                ""
316            }
317        }
318    }
319}
320
321impl ChatMessage {
322    pub fn system(content: &str) -> Self {
323        Self {
324            role: "system".into(),
325            content: MessageContent::Text(content.into()),
326            tool_calls: Vec::new(),
327            tool_call_id: None,
328            name: None,
329        }
330    }
331    pub fn user(content: &str) -> Self {
332        Self {
333            role: "user".into(),
334            content: MessageContent::Text(content.into()),
335            tool_calls: Vec::new(),
336            tool_call_id: None,
337            name: None,
338        }
339    }
340    pub fn user_with_image(text: &str, image_url: &str) -> Self {
341        let mut text_parts = text.to_string();
342        if !text_parts.contains("<|image|>") {
343            text_parts.push_str(" <|image|>");
344        }
345        Self {
346            role: "user".into(),
347            content: MessageContent::Parts(vec![
348                ContentPart::Text { text: text_parts },
349                ContentPart::ImageUrl {
350                    image_url: ImageUrlSource {
351                        url: image_url.into(),
352                    },
353                },
354            ]),
355            tool_calls: Vec::new(),
356            tool_call_id: None,
357            name: None,
358        }
359    }
360    pub fn assistant_text(content: &str) -> Self {
361        Self {
362            role: "assistant".into(),
363            content: MessageContent::Text(content.into()),
364            tool_calls: Vec::new(),
365            tool_call_id: None,
366            name: None,
367        }
368    }
369    pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
370        Self {
371            role: "assistant".into(),
372            content: MessageContent::Text(content.into()),
373            tool_calls: calls,
374            tool_call_id: None,
375            name: None,
376        }
377    }
378    pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
379        Self::tool_result_for_model(tool_call_id, fn_name, content, "")
380    }
381
382    /// Build a tool result message, applying Gemma 4 native markup only when the
383    /// loaded model is actually a Gemma 4 model.
384    pub fn tool_result_for_model(
385        tool_call_id: &str,
386        fn_name: &str,
387        content: &str,
388        model: &str,
389    ) -> Self {
390        let body = if is_gemma4_model_name(model) {
391            format!(
392                "<|tool_response>response:{}{}{}<tool_response|>",
393                fn_name, "{", content
394            )
395        } else {
396            content.to_string()
397        };
398        Self {
399            role: "tool".into(),
400            content: MessageContent::Text(body),
401            tool_calls: Vec::new(),
402            tool_call_id: Some(tool_call_id.into()),
403            name: Some(fn_name.into()),
404        }
405    }
406}
407
408// ── Tool call as returned by the model ───────────────────────────────────────
409
410#[derive(Serialize, Deserialize, Clone, Debug)]
411pub struct ToolCallResponse {
412    pub id: String,
413    #[serde(rename = "type")]
414    pub call_type: String,
415    pub function: ToolCallFn,
416}
417
418#[derive(Serialize, Deserialize, Clone, Debug)]
419pub struct ToolCallFn {
420    pub name: String,
421    /// JSON-encoded arguments string (as returned by the API).
422    pub arguments: String,
423}
424
425// ── HTTP request / response shapes ───────────────────────────────────────────
426
427#[derive(Serialize)]
428struct ChatRequest {
429    model: String,
430    messages: Vec<ChatMessage>,
431    temperature: f32,
432    stream: bool,
433    #[serde(skip_serializing_if = "Option::is_none")]
434    tools: Option<Vec<ToolDefinition>>,
435}
436
437#[derive(Deserialize, Debug)]
438struct ChatResponse {
439    choices: Vec<ResponseChoice>,
440    usage: Option<TokenUsage>,
441}
442
443#[derive(Deserialize, Debug, Clone)]
444pub struct TokenUsage {
445    pub prompt_tokens: usize,
446    pub completion_tokens: usize,
447    pub total_tokens: usize,
448    #[serde(default)]
449    pub prompt_cache_hit_tokens: usize,
450    #[serde(default)]
451    pub cache_read_input_tokens: usize,
452}
453
454#[derive(Deserialize, Debug)]
455struct ResponseChoice {
456    message: ResponseMessage,
457    #[serde(default)]
458    finish_reason: Option<String>,
459}
460
461#[derive(Deserialize, Debug)]
462struct ResponseMessage {
463    content: Option<String>,
464    tool_calls: Option<Vec<ToolCallResponse>>,
465    /// LM Studio routes Qwen3 thinking-mode output here instead of wrapping
466    /// it in <think> tags inside `content`. When tool calls are generated
467    /// inside a think block, they end up here rather than in `tool_calls`.
468    #[serde(default)]
469    reasoning_content: Option<String>,
470}
471
/// Smallest number of tokens set aside for model output.
/// NOTE(review): the code applying this bound is outside this section.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1_024;
/// Largest number of tokens set aside for model output.
/// NOTE(review): the code applying this bound is outside this section.
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4_096;
474
/// True when the context window is at most 8192 tokens.
fn is_tiny_context_window(context_length: usize) -> bool {
    const TINY_WINDOW_MAX_TOKENS: usize = 8_192;
    context_length <= TINY_WINDOW_MAX_TOKENS
}
478
/// True when the context window is larger than 8192 tokens but no larger
/// than 49152 tokens.
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
482
483pub fn is_compact_context_window_pub(context_length: usize) -> bool {
484    is_compact_context_window(context_length)
485}
486
/// Recognises provider error text that indicates a context-limit failure.
///
/// `lower` must already be lowercased by the caller; matching is done with
/// plain substring checks. The text matches when it mentions both `n_keep`
/// and `n_ctx`, or contains any one of the fixed phrases below.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const LIMIT_PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    let mentions_keep_and_ctx = lower.contains("n_keep") && lower.contains("n_ctx");
    mentions_keep_and_ctx || LIMIT_PHRASES.iter().any(|phrase| lower.contains(*phrase))
}
494
495fn classify_runtime_failure_tag(detail: &str) -> &'static str {
496    let lower = detail.to_ascii_lowercase();
497    if lower.contains("context_window_blocked")
498        || lower.contains("context ceiling reached")
499        || lower.contains("exceeds the")
500        || is_provider_context_limit_detail(&lower)
501    {
502        "context_window"
503    } else if lower.contains("empty response from model")
504        || lower.contains("model returned an empty response")
505    {
506        "empty_model_response"
507    } else if lower.contains("action blocked:")
508        || lower.contains("access denied")
509        || lower.contains("declined by user")
510    {
511        "tool_policy_blocked"
512    } else {
513        "provider_degraded"
514    }
515}
516
/// Returns the operator guidance sentence for a failure tag.
///
/// Unrecognised tags (including `provider_degraded`) share the generic
/// "retry, then narrow or restart" advice.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const CONTEXT_WINDOW: &str = "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    const EMPTY_RESPONSE: &str = "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    const POLICY_BLOCKED: &str =
        "Stay inside the allowed workflow or switch modes before retrying.";
    const GENERIC: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";

    match tag {
        "context_window" => CONTEXT_WINDOW,
        "empty_model_response" => EMPTY_RESPONSE,
        "tool_policy_blocked" => POLICY_BLOCKED,
        _ => GENERIC,
    }
}
531
532fn format_runtime_failure_message(detail: &str) -> String {
533    let tag = classify_runtime_failure_tag(detail);
534    format!(
535        "[failure:{}] {} Detail: {}",
536        tag,
537        runtime_failure_guidance(tag),
538        detail.trim()
539    )
540}
541
/// Compact provider health state surfaced to the operator.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProviderRuntimeState {
    Booting,
    Live,
    Recovering,
    /// Fallback state for failures that are neither context nor empty-reply related.
    Degraded,
    /// The failure was tagged `context_window`.
    ContextWindow,
    /// The failure was tagged `empty_model_response`.
    EmptyResponse,
}
551
/// Compact MCP server health state surfaced to the operator.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum McpRuntimeState {
    Unconfigured,
    Healthy,
    Degraded,
    Failed,
}
559
/// Typed checkpoint / blocker state reported to operator-facing surfaces.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum OperatorCheckpointState {
    Idle,
    RecoveringProvider,
    BudgetReduced,
    HistoryCompacted,
    BlockedContextWindow,
    BlockedPolicy,
    BlockedRecentFileEvidence,
    BlockedExactLineWindow,
    BlockedToolLoop,
    BlockedVerification,
}

impl OperatorCheckpointState {
    /// Stable snake_case identifier for this state (the variant name, lowercased
    /// with underscores).
    pub fn label(self) -> &'static str {
        match self {
            Self::Idle => "idle",
            Self::RecoveringProvider => "recovering_provider",
            Self::BudgetReduced => "budget_reduced",
            Self::HistoryCompacted => "history_compacted",
            Self::BlockedContextWindow => "blocked_context_window",
            Self::BlockedPolicy => "blocked_policy",
            Self::BlockedRecentFileEvidence => "blocked_recent_file_evidence",
            Self::BlockedExactLineWindow => "blocked_exact_line_window",
            Self::BlockedToolLoop => "blocked_tool_loop",
            Self::BlockedVerification => "blocked_verification",
        }
    }
}
590
591fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
592    match tag {
593        "context_window" => ProviderRuntimeState::ContextWindow,
594        "empty_model_response" => ProviderRuntimeState::EmptyResponse,
595        _ => ProviderRuntimeState::Degraded,
596    }
597}
598
/// Produces a one-line failure summary for the operator surface.
///
/// The three known tags get fixed sentences. Any other tag yields
/// `LM Studio degraded: <excerpt>`, where the excerpt is the first 12
/// whitespace-separated words of `detail`, capped at 110 bytes and suffixed
/// with `...` when capped. An empty excerpt falls back to a generic sentence.
///
/// The cap never splits a UTF-8 character: if byte 110 falls inside a
/// multi-byte character, the cut moves back to the start of that character.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    /// Maximum excerpt size in bytes before the ellipsis is appended.
    const MAX_EXCERPT_BYTES: usize = 110;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(12)
                .collect::<Vec<_>>()
                .join(" ");
            if excerpt.len() > MAX_EXCERPT_BYTES {
                // `String::truncate` panics off a char boundary, so walk back to
                // the nearest one. Index 0 is always a boundary, so this ends.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }
            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
632
633// ── Events pushed to the TUI ──────────────────────────────────────────────────
634
635#[derive(Debug)]
636pub enum InferenceEvent {
637    /// A text token to append to the current assistant message.
638    Token(String),
639    /// A text token to be displayed on screen but NOT spoken (e.g. startup greeting).
640    MutedToken(String),
641    /// Internal model reasoning (shown in side panel, not dialogue).
642    Thought(String),
643    /// Critical diagnostic feedback from the voice synthesis engine.
644    VoiceStatus(String),
645    /// A tool call is starting – show a status line in the TUI.
646    ToolCallStart {
647        id: String,
648        name: String,
649        args: String,
650    },
651    /// A tool call completed – show result in the TUI.
652    ToolCallResult {
653        id: String,
654        name: String,
655        output: String,
656        is_error: bool,
657    },
658    /// A risky tool requires explicit user approval.
659    /// The TUI must send `true` (approved) or `false` (rejected) via `responder`.
660    /// When `diff` is Some, the modal renders a coloured before/after diff preview.
661    ApprovalRequired {
662        id: String,
663        name: String,
664        display: String,
665        /// Pre-formatted diff: lines starting with "- " are removals, "+ " are additions,
666        /// "---" is a file header.  None means a plain high-risk approval (no diff).
667        diff: Option<String>,
668        /// Intent label for mutation protocol (Cyan box trigger).
669        mutation_label: Option<String>,
670        responder: tokio::sync::oneshot::Sender<bool>,
671    },
672    /// The current agent turn is complete.
673    Done,
674    /// An error occurred during inference.
675    Error(String),
676    /// Compact provider/runtime state for the operator surface.
677    ProviderStatus {
678        state: ProviderRuntimeState,
679        summary: String,
680    },
681    /// Typed operator checkpoint/blocker state for SPECULAR and recovery UIs.
682    OperatorCheckpoint {
683        state: OperatorCheckpointState,
684        summary: String,
685    },
686    /// Typed recovery recipe summary for operator/debug surfaces.
687    RecoveryRecipe { summary: String },
688    /// Compact MCP/runtime server health for the operator surface.
689    McpStatus {
690        state: McpRuntimeState,
691        summary: String,
692    },
693    /// Current compaction pressure against the adaptive threshold.
694    CompactionPressure {
695        estimated_tokens: usize,
696        threshold_tokens: usize,
697        percent: u8,
698    },
699    /// Current total prompt-budget pressure against the live context window.
700    PromptPressure {
701        estimated_input_tokens: usize,
702        reserved_output_tokens: usize,
703        estimated_total_tokens: usize,
704        context_length: usize,
705        percent: u8,
706    },
707    /// A generic task progress update (e.g. for single-agent tool execution).
708    TaskProgress {
709        id: String,
710        label: String,
711        progress: u8,
712    },
713    /// Real-time token usage update from the API.
714    UsageUpdate(TokenUsage),
715    /// The current runtime profile detected from LM Studio.
716    RuntimeProfile {
717        model_id: String,
718        context_length: usize,
719    },
720    /// Vein index status after each incremental re-index.
721    VeinStatus {
722        file_count: usize,
723        embedded_count: usize,
724        docs_only: bool,
725    },
726    /// File paths the Vein surfaced as relevant to the current turn.
727    /// Used to populate ACTIVE CONTEXT with retrieval results.
728    VeinContext { paths: Vec<String> },
729    /// A new companion was hatched mid-session via /reroll.
730    SoulReroll {
731        species: String,
732        rarity: String,
733        shiny: bool,
734        personality: String,
735    },
736    /// A "Dive-In" command (cd <dir> && hematite) to be copied to the clipboard.
737    CopyDiveInCommand(String),
738    /// Embed model loaded/unloaded mid-session.
739    EmbedProfile { model_id: Option<String> },
740    /// A single line of live shell output, streamed while the command runs.
741    /// Displayed in the SPECULAR panel so the operator sees progress without
742    /// waiting for the full command to finish.
743    ShellLine(String),
744}
745
746// ── Engine implementation ─────────────────────────────────────────────────────
747
748impl InferenceEngine {
749    pub fn new(
750        api_url: String,
751        species: String,
752        snark: u8,
753    ) -> Result<Self, Box<dyn std::error::Error>> {
754        let client = reqwest::Client::builder()
755            .timeout(std::time::Duration::from_secs(180))
756            .build()?;
757
758        // Extract http://host:port as the base for all non-completions endpoints.
759        let base_url = {
760            let trimmed = api_url.trim_end_matches('/');
761            if let Some(scheme_end) = trimmed.find("://") {
762                let after_scheme = &trimmed[scheme_end + 3..];
763                if let Some(path_start) = after_scheme.find('/') {
764                    format!(
765                        "{}://{}",
766                        &trimmed[..scheme_end],
767                        &after_scheme[..path_start]
768                    )
769                } else {
770                    trimmed.to_string()
771                }
772            } else {
773                trimmed.to_string()
774            }
775        };
776
777        let api_url = if api_url.ends_with("/chat/completions") {
778            api_url
779        } else if api_url.ends_with("/") {
780            format!("{}chat/completions", api_url)
781        } else {
782            format!("{}/chat/completions", api_url)
783        };
784
785        Ok(Self {
786            client,
787            api_url,
788            base_url,
789            species,
790            snark,
791            kv_semaphore: Semaphore::new(3),
792            model: std::sync::RwLock::new(String::new()),
793            context_length: std::sync::atomic::AtomicUsize::new(32_768), // Gemma-4 Sweet Spot (32K)
794            economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
795            worker_model: None,
796            gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
797            cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
798        })
799    }
800
801    pub fn set_gemma_native_formatting(&self, enabled: bool) {
802        self.gemma_native_formatting
803            .store(enabled, std::sync::atomic::Ordering::SeqCst);
804    }
805
806    pub fn gemma_native_formatting_enabled(&self) -> bool {
807        self.gemma_native_formatting
808            .load(std::sync::atomic::Ordering::SeqCst)
809    }
810
811    pub fn current_model(&self) -> String {
812        self.model.read().map(|g| g.clone()).unwrap_or_default()
813    }
814
815    pub fn current_context_length(&self) -> usize {
816        self.context_length
817            .load(std::sync::atomic::Ordering::SeqCst)
818    }
819
820    pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
821        if let Ok(mut guard) = self.model.write() {
822            *guard = model.to_string();
823        }
824        self.context_length
825            .store(context_length, std::sync::atomic::Ordering::SeqCst);
826    }
827
828    /// Returns true if LM Studio is reachable.
829    pub async fn health_check(&self) -> bool {
830        let url = format!("{}/v1/models", self.base_url);
831        match self.client.get(&url).send().await {
832            Ok(resp) => resp.status().is_success(),
833            Err(_) => false,
834        }
835    }
836
837    /// Query /api/v0/models and return the first loaded chat model id.
838    /// Uses /api/v0/models (not /v1/models) because the OpenAI-compat endpoint
839    /// omits the `type` field, making it impossible to distinguish embedding
840    /// models from chat models. Falls back to /v1/models with a name heuristic
841    /// if /api/v0/models is unavailable.
842    /// Returns Some("") when LM Studio is reachable but no chat model is loaded
843    /// so callers can distinguish "offline" (None) from "no chat model" (Some("")).
844    pub async fn get_loaded_model(&self) -> Option<String> {
845        #[derive(Deserialize)]
846        struct ModelList {
847            data: Vec<ModelEntry>,
848        }
849        #[derive(Deserialize)]
850        struct ModelEntry {
851            id: String,
852            #[serde(rename = "type", default)]
853            model_type: String,
854            #[serde(default)]
855            state: String,
856        }
857
858        // Try /api/v0/models first — it has type and state fields.
859        if let Ok(resp) = self
860            .client
861            .get(format!("{}/api/v0/models", self.base_url))
862            .send()
863            .await
864        {
865            if let Ok(list) = resp.json::<ModelList>().await {
866                let chat_model = list
867                    .data
868                    .into_iter()
869                    .find(|m| m.model_type != "embeddings" && m.state == "loaded")
870                    .map(|m| m.id)
871                    .unwrap_or_default();
872                return Some(chat_model);
873            }
874        }
875
876        // Fallback: /v1/models lacks type info — use name heuristic to skip embed models.
877        let resp = self
878            .client
879            .get(format!("{}/v1/models", self.base_url))
880            .send()
881            .await
882            .ok()?;
883        let list: ModelList = resp.json().await.ok()?;
884        Some(
885            list.data
886                .into_iter()
887                .find(|m| !m.id.to_lowercase().contains("embed"))
888                .map(|m| m.id)
889                .unwrap_or_default(),
890        )
891    }
892
893    /// Returns the ID of the first loaded embedding model, if any.
894    /// Uses /api/v0/models which includes `type` and `state` fields.
895    /// The OpenAI-compat /v1/models endpoint omits `type` so cannot be used here.
896    /// Accepts any non-empty state (not just "loaded") to handle LM Studio variants
897    /// where the embed model may report a different state string at startup.
898    pub async fn get_embedding_model(&self) -> Option<String> {
899        #[derive(Deserialize)]
900        struct ModelList {
901            data: Vec<ModelEntry>,
902        }
903        #[derive(Deserialize)]
904        struct ModelEntry {
905            id: String,
906            #[serde(rename = "type", default)]
907            model_type: String,
908            #[serde(default)]
909            state: String,
910        }
911        let resp = self
912            .client
913            .get(format!("{}/api/v0/models", self.base_url))
914            .send()
915            .await
916            .ok()?;
917        let list: ModelList = resp.json().await.ok()?;
918        list.data
919            .into_iter()
920            .find(|m| m.model_type == "embeddings" && m.state == "loaded")
921            .map(|m| m.id)
922    }
923
924    /// Detect the loaded model's context window size.
925    /// Tries LM Studio's `/api/v0/models` endpoint first and prefers the loaded
926    /// model's live `loaded_context_length`, then falls back to older
927    /// `context_length` / `max_context_length` style fields.
928    /// Falls back to a heuristic from the model name, then 32K.
929    pub async fn detect_context_length(&self) -> usize {
930        #[derive(Deserialize)]
931        struct LmStudioModel {
932            id: Option<String>,
933            #[serde(rename = "type", default)]
934            model_type: String,
935            state: Option<String>,
936            loaded_context_length: Option<u64>,
937            context_length: Option<u64>,
938            max_context_length: Option<u64>,
939        }
940        #[derive(Deserialize)]
941        struct LmStudioList {
942            data: Vec<LmStudioModel>,
943        }
944
945        // Check api/v0/models (LM Studio specific)
946        if let Ok(resp) = self
947            .client
948            .get(format!("{}/api/v0/models", self.base_url))
949            .send()
950            .await
951        {
952            if let Ok(list) = resp.json::<LmStudioList>().await {
953                let target_model = self.current_model().to_ascii_lowercase();
954                // Never select embedding models for context-length detection.
955                let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
956                let loaded = list
957                    .data
958                    .iter()
959                    .find(|m| {
960                        non_embed(m)
961                            && m.state.as_deref() == Some("loaded")
962                            && m.id
963                                .as_deref()
964                                .map(|id| id.eq_ignore_ascii_case(&target_model))
965                                .unwrap_or(false)
966                    })
967                    .or_else(|| {
968                        list.data
969                            .iter()
970                            .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
971                    })
972                    .or_else(|| {
973                        list.data.iter().find(|m| {
974                            non_embed(m)
975                                && m.id
976                                    .as_deref()
977                                    .map(|id| id.eq_ignore_ascii_case(&target_model))
978                                    .unwrap_or(false)
979                        })
980                    })
981                    .or_else(|| list.data.iter().find(|m| non_embed(m)));
982
983                if let Some(model) = loaded {
984                    if let Some(ctx) = model.loaded_context_length {
985                        if ctx > 0 {
986                            return ctx as usize;
987                        }
988                    }
989                    if let Some(ctx) = model.context_length {
990                        if ctx > 0 {
991                            return ctx as usize;
992                        }
993                    }
994                    if let Some(ctx) = model.max_context_length {
995                        if ctx > 0 && ctx <= 32_768 {
996                            return ctx as usize;
997                        }
998                    }
999                }
1000            }
1001        }
1002
1003        // Heuristic fallback:
1004        // If "gemma-4" is detected, we target 32,768 as the baseline standard,
1005        // acknowledging that 131,072 is available for High-Capacity tasks.
1006        if self.current_model().to_lowercase().contains("gemma-4") {
1007            return 32_768;
1008        }
1009
1010        32_768
1011    }
1012
1013    pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
1014        let previous_model = self.current_model();
1015        let previous_context = self.current_context_length();
1016
1017        let detected_model = match self.get_loaded_model().await {
1018            Some(m) if !m.is_empty() => m,            // coding model found
1019            Some(_) => "no model loaded".to_string(), // reachable but no coding model
1020            None => previous_model.clone(),           // LM Studio offline
1021        };
1022
1023        if !detected_model.is_empty() && detected_model != previous_model {
1024            if let Ok(mut guard) = self.model.write() {
1025                *guard = detected_model.clone();
1026            }
1027        }
1028
1029        let detected_context = self.detect_context_length().await;
1030        let effective_model = if detected_model.is_empty() {
1031            previous_model.clone()
1032        } else {
1033            detected_model
1034        };
1035
1036        let changed = effective_model != previous_model || detected_context != previous_context;
1037        self.set_runtime_profile(&effective_model, detected_context);
1038
1039        Some((effective_model, detected_context, changed))
1040    }
1041
1042    pub fn build_system_prompt(
1043        &self,
1044        snark: u8,
1045        chaos: u8,
1046        brief: bool,
1047        professional: bool,
1048        tools: &[ToolDefinition],
1049        reasoning_history: Option<&str>,
1050        mcp_tools: &[crate::agent::mcp::McpTool],
1051    ) -> String {
1052        let mut sys = self.build_system_prompt_legacy(
1053            snark,
1054            chaos,
1055            brief,
1056            professional,
1057            tools,
1058            reasoning_history,
1059        );
1060
1061        if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1062            sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1063            sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1064            for tool in mcp_tools {
1065                let description = tool
1066                    .description
1067                    .as_deref()
1068                    .unwrap_or("No description provided.");
1069                sys.push_str(&format!("- {}: {}\n", tool.name, description));
1070            }
1071        }
1072
1073        sys
1074    }
1075
1076    pub fn build_system_prompt_legacy(
1077        &self,
1078        snark: u8,
1079        _chaos: u8,
1080        brief: bool,
1081        professional: bool,
1082        tools: &[ToolDefinition],
1083        reasoning_history: Option<&str>,
1084    ) -> String {
1085        let current_context_length = self.current_context_length();
1086        if is_tiny_context_window(current_context_length) {
1087            return self.build_system_prompt_tiny(brief, professional);
1088        }
1089        if is_compact_context_window(current_context_length) {
1090            return self.build_system_prompt_compact(brief, professional, tools);
1091        }
1092
1093        // Hematite bootstrap: keep reasoning disciplined without leaking scaffolding into user-facing replies.
1094        let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
1095                                     - You are Hematite, a local coding system working on the user's machine.\n\
1096                                     - The running Hematite build is ");
1097        sys.push_str(&crate::hematite_version_display());
1098        sys.push_str(".\n\
1099                                     - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
1100                                     - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
1101                                     - For simple questions, answer briefly in plain language.\n\
1102                                     - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
1103                                     - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
1104                                     - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
1105                                     - Keep internal reasoning inside channel delimiters.\n\
1106                                     - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
1107                                     <turn|>\n\n");
1108
1109        if let Some(history) = reasoning_history {
1110            if !history.is_empty() {
1111                sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
1112                sys.push_str(history);
1113                sys.push_str("\n\n");
1114            }
1115        }
1116
1117        // ADAPTIVE THOUGHT EFFICIENCY (Gemma-4 Native)
1118        if brief {
1119            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
1120                          - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
1121                          - Depth: Surface-level verification only.\n\n");
1122        } else {
1123            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
1124                          - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
1125                          - Depth: Full multi-step derivation required.\n\n");
1126        }
1127
1128        // IDENTITY & ENVIRONMENT
1129        let os = std::env::consts::OS;
1130        if professional {
1131            sys.push_str(&format!(
1132                "You are Hematite, a local coding system running on {}. \
1133                 The TUI is one interface layer, not your whole identity. \
1134                 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
1135                 Skip filler and keep the focus on the work.\n",
1136                os
1137            ));
1138        } else {
1139            sys.push_str(&format!(
1140                "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
1141                 The terminal UI is only one surface of the system. \
1142                 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
1143                 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
1144                self.species, snark, os
1145            ));
1146        }
1147
1148        // Inject loaded model and context window so the model knows its own budget.
1149        let current_model = self.current_model();
1150        if !current_model.is_empty() {
1151            sys.push_str(&format!(
1152                "Loaded model: {} | Context window: {} tokens. \
1153                 Calibrate response length and tool-call depth to fit within this budget.\n\n",
1154                current_model, current_context_length
1155            ));
1156            if is_gemma4_model_name(&current_model) {
1157                sys.push_str(
1158                    "Gemma 4 native note: prefer exact tool JSON with no extra prose when calling tools. \
1159                     Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
1160                     For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
1161                );
1162            }
1163        } else {
1164            sys.push_str(&format!(
1165                "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
1166                current_context_length
1167            ));
1168        }
1169
1170        // PROTOCOL & TOOLS
1171        let shell_desc = if cfg!(target_os = "windows") {
1172            "[EXTERNAL SHELL]: `powershell` (Windows).\n\
1173             - Use ONLY for builds, tests, or file migrations. \n\
1174             - You MUST use the `powershell` tool directly. \n\
1175             - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
1176        } else {
1177            "[EXTERNAL SHELL]: `bash` (Unix).\n\
1178             - Use ONLY for builds, tests, or file migrations. \n\
1179             - NEVER wrap bash in other shells. \n\n"
1180        };
1181
1182        sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
1183                      [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
1184                      - These are the ONLY way to explore and modify code. \n\
1185                      - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
1186        sys.push_str(shell_desc);
1187
1188        // ANTI-LOOPING & SELF-AUDIT
1189        sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
1190                      SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
1191
1192        if brief {
1193            sys.push_str(
1194                "BRIEF MODE: Respond in exactly ONE concise sentence unless providing code.\n\n",
1195            );
1196        }
1197
1198        sys.push_str("## CORE DIRECTIVES\n\
1199                       1. REASONING: Your internal reasoning goes in <think>...</think> blocks. Do NOT output reasoning as plain text.\n\
1200                       2. CONCISENESS: After <think>, output ONE concise technical sentence or code block. Nothing else.\n\
1201                       3. GROUNDEDNESS: Never invent tools, channels, or files. If a detail is not verified from tool output, say `uncertain`. Answer from stable Hematite capabilities unless repo implementation is requested.\n\n\
1202                       ## ARCHITECTURAL DISCIPLINE\n\
1203                       - HOST INSPECTION PRIORITY: MANDATORY. For all diagnostic questions (load, CPU/RAM, processes, toolchains, network, ports, OS config, log-checks), prefer `inspect_host` over raw `shell`. If `env_doctor` answers, do not follow with `path` unless requested.\n\
1204                       - WORKFLOW PRIORITY: Prefer `run_workspace_workflow` for project builds/tests and `run_hematite_maintainer_workflow` for app-level scripts over raw `shell`.\n\
1205                       - PROOF BEFORE ACTION: `read_file` or `inspect_lines` before editing. For files >200 lines, `grep_files` for a pattern BEFORE reading.\n\
1206                       - PROOF BEFORE COMMIT: Do not `git_commit` until a successful `verify_build` exists for the latest changes.\n\
1207                       - BUILT-IN FIRST: Prefer internal file tools over `mcp__filesystem__*` unless MCP is explicitly required.\n\n\
1208                       ## TECHNIQUE & SAFETY\n\
1209                       - SHELL DISCIPLINE: Risky `shell` calls REQUIRE a `reason` argument. Always use `powershell` on Windows; never `bash` or `/dev/null`.\n\
1210                       - EDIT & TOOL PRECISION: Use unique lines/anchors for `edit_file`. Do not call tools like 'think' or 'reasoning'. Paths and symbols are NOT tool names.\n\
1211                       - CONTEXT AWARENESS: Answer at the harness level (file ops, shell, build). Prefer real language examples (Python, C#, TS, Go). Never mention `mcp__*` tools unless active and relevant.\n\
1212                       - TOOLING DISCIPLINE: Prefer `describe_toolchain` over improvising tool surface from memory; preserve its identifiers exactly.");
1213
1214        // Scaffolding protocol — enforces build validation after project creation.
1215        sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
1216            2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
1217            3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
1218            4. Fix all errors before declaring success.\n\n\
1219            ## PRE-FLIGHT SCOPING PROTOCOL\n\
1220            Before attempting any multi-file task or complex refactor:\n\
1221            1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
1222            2. Use `auto_pin_context` to keep those files in active context.\n\
1223            3. Only then proceed to deeper edits or research.\n\n\
1224            ## REFACTORING PROTOCOL\n\
1225            When modifying existing code or renaming symbols:\n\
1226            1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
1227            2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
1228            3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
1229
1230        // Inject CLAUDE.md / instruction files from the project directory.
1231        sys.push_str(&load_instruction_files());
1232
1233        // Inject cross-session memories synthesized by DeepReflect.
1234        sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
1235
1236        // Native Gemma-4 Tool Declarations
1237        if !tools.is_empty() {
1238            sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
1239            for tool in tools {
1240                let schema = serde_json::to_string(&tool.function.parameters)
1241                    .unwrap_or_else(|_| "{}".to_string());
1242                sys.push_str(&format!(
1243                    "<|tool>declaration:{}{}{}<tool|>\n",
1244                    tool.function.name, "{", schema
1245                ));
1246                sys.push_str(&format!("// {})\n", tool.function.description));
1247            }
1248        }
1249
1250        sys
1251    }
1252
1253    fn build_system_prompt_compact(
1254        &self,
1255        brief: bool,
1256        professional: bool,
1257        tools: &[ToolDefinition],
1258    ) -> String {
1259        // Compact tier: fits in 16k context. Keeps tool names + one-line descriptions
1260        // but skips full JSON schemas, verbose protocol sections, and CLAUDE.md injection.
1261        let current_model = self.current_model();
1262        let current_context_length = self.current_context_length();
1263        let os = std::env::consts::OS;
1264
1265        let mut sys = String::from("<|turn>system\n<|think|>\n");
1266        sys.push_str(&format!(
1267            "You are Hematite {}, a local coding harness working on the user's machine.\n",
1268            crate::hematite_version_display()
1269        ));
1270        if professional {
1271            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1272        } else {
1273            sys.push_str(&format!(
1274                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1275                self.species
1276            ));
1277        }
1278        sys.push_str(&format!(
1279            "Model: {} | Context: {} tokens. Keep turns focused.\n",
1280            current_model, current_context_length
1281        ));
1282        if is_gemma4_model_name(&current_model) {
1283            sys.push_str(
1284                "Gemma 4: use exact tool JSON. No extra prose in tool calls. \
1285                 Raw regex patterns in grep_files, no slash delimiters.\n",
1286            );
1287        }
1288        if cfg!(target_os = "windows") {
1289            sys.push_str(&format!(
1290                "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1291                os
1292            ));
1293        } else {
1294            sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1295        }
1296        if brief {
1297            sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1298        }
1299
1300        sys.push_str(
1301            "\nCORE RULES:\n\
1302             - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1303             - Verify after edits: run `verify_build` after code changes, before committing.\n\
1304             - One tool at a time. Do not batch unrelated tool calls.\n\
1305             - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1306             - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1307             - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1308        );
1309
1310        if !tools.is_empty() {
1311            sys.push_str("\n# AVAILABLE TOOLS\n");
1312            for tool in tools {
1313                let desc: String = tool.function.description.chars().take(120).collect();
1314                sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1315            }
1316        }
1317
1318        sys.push_str("<turn|>\n");
1319        sys
1320    }
1321
1322    fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1323        let current_model = self.current_model();
1324        let current_context_length = self.current_context_length();
1325        let os = std::env::consts::OS;
1326        let mut sys = format!(
1327            "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1328            crate::hematite_version_display()
1329        );
1330        if professional {
1331            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1332        } else {
1333            sys.push_str(&format!(
1334                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1335                self.species
1336            ));
1337        }
1338        if !current_model.is_empty() {
1339            sys.push_str(&format!(
1340                "Loaded model: {} | Context window: {} tokens.\n",
1341                current_model, current_context_length
1342            ));
1343        } else {
1344            sys.push_str(&format!(
1345                "Context window: {} tokens.\n",
1346                current_context_length
1347            ));
1348        }
1349        sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1350        sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1351        sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1352        if cfg!(target_os = "windows") {
1353            sys.push_str(&format!(
1354                "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1355                os
1356            ));
1357        } else {
1358            sys.push_str(&format!(
1359                "You are running on {}. Use the native Unix shell conventions.\n",
1360                os
1361            ));
1362        }
1363        if brief {
1364            sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1365        }
1366        if is_gemma4_model_name(&current_model) {
1367            sys.push_str(
1368                "Gemma 4 note: use exact tool JSON with no extra prose when calling tools.\n",
1369            );
1370        }
1371        sys.push_str("<turn|>\n");
1372        sys
1373    }
1374
1375    // ── Non-streaming call (used for agentic turns with tool support) ─────────
1376
1377    /// Send messages to the model. Returns (text_content, tool_calls).
1378    /// Exactly one of the two will be Some on a successful response.
1379    pub async fn call_with_tools(
1380        &self,
1381        messages: &[ChatMessage],
1382        tools: &[ToolDefinition],
1383        // Override the model ID for this call. None = use the live runtime model.
1384        model_override: Option<&str>,
1385    ) -> Result<
1386        (
1387            Option<String>,
1388            Option<Vec<ToolCallResponse>>,
1389            Option<TokenUsage>,
1390            Option<String>,
1391        ),
1392        String,
1393    > {
1394        let _permit = self
1395            .kv_semaphore
1396            .acquire()
1397            .await
1398            .map_err(|e| e.to_string())?;
1399
1400        let current_model = self.current_model();
1401        let model = model_override.unwrap_or(current_model.as_str()).to_string();
1402        let filtered_tools = if cfg!(target_os = "windows") {
1403            tools
1404                .iter()
1405                .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1406                .cloned()
1407                .collect::<Vec<_>>()
1408        } else {
1409            tools.to_vec()
1410        };
1411
1412        let request_messages = if should_use_gemma_native_formatting(self, &model) {
1413            prepare_gemma_native_messages(messages)
1414        } else {
1415            messages.to_vec()
1416        };
1417
1418        // In compact context windows, restrict tools to the core coding set.
1419        // Full schemas for 36+ tools add 10k+ tokens via the model's chat template (e.g. Gemma 4).
1420        // Sending a small core set keeps schemas available for structured tool-call dispatch
1421        // while staying within the 16k budget.
1422        const COMPACT_CORE_TOOLS: &[&str] = &[
1423            "read_file",
1424            "inspect_lines",
1425            "edit_file",
1426            "write_file",
1427            "grep_files",
1428            "list_files",
1429            "verify_build",
1430            "shell",
1431        ];
1432        let effective_tools = if is_compact_context_window(self.current_context_length()) {
1433            let core: Vec<_> = filtered_tools
1434                .iter()
1435                .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1436                .cloned()
1437                .collect();
1438            if core.is_empty() {
1439                None
1440            } else {
1441                Some(core)
1442            }
1443        } else if filtered_tools.is_empty() {
1444            None
1445        } else {
1446            Some(filtered_tools)
1447        };
1448
1449        let request = ChatRequest {
1450            model: model.clone(),
1451            messages: request_messages,
1452            temperature: 0.2,
1453            stream: false,
1454            tools: effective_tools,
1455        };
1456
1457        // Exponential backoff: retry up to 3× on 5xx / timeout / connect errors.
1458        preflight_chat_request(
1459            &model,
1460            &request.messages,
1461            request.tools.as_deref().unwrap_or(&[]),
1462            self.current_context_length(),
1463        )?;
1464
1465        let mut last_err = String::new();
1466        let mut response_opt: Option<reqwest::Response> = None;
1467        for attempt in 0..3u32 {
1468            match self.client.post(&self.api_url).json(&request).send().await {
1469                Ok(res) if res.status().is_success() => {
1470                    response_opt = Some(res);
1471                    break;
1472                }
1473                Ok(res) if res.status().as_u16() >= 500 => {
1474                    last_err = format!("LM Studio error {}", res.status());
1475                }
1476                Ok(res) => {
1477                    // 4xx — don't retry
1478                    let status = res.status();
1479                    let body = res.text().await.unwrap_or_default();
1480                    let preview = &body[..body.len().min(300)];
1481                    return Err(format!("LM Studio error {}: {}", status, preview));
1482                }
1483                Err(e) if e.is_timeout() || e.is_connect() => {
1484                    last_err = format!("Request failed: {}", e);
1485                }
1486                Err(e) => return Err(format!("Request failed: {}", e)),
1487            }
1488            if attempt < 2 {
1489                let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1490                tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1491            }
1492        }
1493        let res = response_opt
1494            .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1495
1496        let body: ChatResponse = res
1497            .json()
1498            .await
1499            .map_err(|e| format!("Response parse error: {}", e))?;
1500
1501        if let Some(usage) = &body.usage {
1502            let mut econ = self.economics.lock().unwrap();
1503            econ.input_tokens += usage.prompt_tokens;
1504            econ.output_tokens += usage.completion_tokens;
1505        }
1506
1507        let choice = body
1508            .choices
1509            .into_iter()
1510            .next()
1511            .ok_or_else(|| "Empty response from model".to_string())?;
1512
1513        let finish_reason = choice.finish_reason;
1514        let mut tool_calls = choice.message.tool_calls;
1515        let mut content = choice.message.content;
1516
1517        // Gemma-4 Fallback: If the model outputs native <|tool_call|> tags in the text content,
1518        // extract them and treat them as valid tool calls.
1519        if let Some(raw_content) = &content {
1520            let native_calls = extract_native_tool_calls(raw_content);
1521            if !native_calls.is_empty() {
1522                let mut existing = tool_calls.unwrap_or_default();
1523                existing.extend(native_calls);
1524                tool_calls = Some(existing);
1525                let stripped = strip_native_tool_call_text(raw_content);
1526                content = if stripped.trim().is_empty() {
1527                    None
1528                } else {
1529                    Some(stripped)
1530                };
1531            }
1532        }
1533
1534        if is_gemma4_model_name(&model) {
1535            if let Some(calls) = tool_calls.as_mut() {
1536                for call in calls.iter_mut() {
1537                    call.function.arguments = normalize_tool_argument_string(
1538                        &call.function.name,
1539                        &call.function.arguments,
1540                    );
1541                }
1542            }
1543        }
1544
1545        // Qwen3 Fallback: When the model generates tool calls inside a <think> block,
1546        // LM Studio routes the entire thinking output (including <tool_call> XML) to
1547        // `reasoning_content` instead of `tool_calls`. If content is empty and we have
1548        // no tool calls yet, check reasoning_content for embedded tool call markup.
1549        let reasoning_text = choice.message.reasoning_content.unwrap_or_default();
1550        if tool_calls.as_ref().map(|v| v.is_empty()).unwrap_or(true)
1551            && content
1552                .as_ref()
1553                .map(|s| s.trim().is_empty())
1554                .unwrap_or(true)
1555            && !reasoning_text.is_empty()
1556        {
1557            let recovered = extract_native_tool_calls(&reasoning_text);
1558            if !recovered.is_empty() {
1559                tool_calls = Some(recovered);
1560                // Clear content so downstream code doesn't see an empty string.
1561                content = None;
1562            } else if finish_reason.as_deref() == Some("stop") {
1563                // Qwen3.5 thinking-mode answer leak: the model wrote its final answer
1564                // inside reasoning_content and left content empty. finish_reason=stop
1565                // with no tool calls means the model intended this as its response.
1566                // Surface reasoning_content as the visible response rather than
1567                // firing the empty-response nudge loop.
1568                content = Some(reasoning_text);
1569            }
1570        }
1571
1572        Ok((content, tool_calls, body.usage, finish_reason))
1573    }
1574
1575    // ── Streaming call (used for plain-text responses) ────────────────────────
1576
1577    /// Stream a conversation (no tools). Emits Token/Done/Error events.
1578    pub async fn stream_messages(
1579        &self,
1580        messages: &[ChatMessage],
1581        tx: mpsc::Sender<InferenceEvent>,
1582    ) -> Result<(), Box<dyn std::error::Error>> {
1583        let current_model = self.current_model();
1584        let request_messages = if should_use_gemma_native_formatting(self, &current_model) {
1585            prepare_gemma_native_messages(messages)
1586        } else {
1587            messages
1588                .iter()
1589                .map(|m| {
1590                    let mut clone = m.clone();
1591                    let current_text = m.content.as_str();
1592                    if !current_text.starts_with("<|turn>") {
1593                        clone.content = MessageContent::Text(format!(
1594                            "<|turn>{}\n{}\n<turn|>",
1595                            m.role, current_text
1596                        ));
1597                    }
1598                    clone
1599                })
1600                .collect()
1601        };
1602
1603        let request = ChatRequest {
1604            model: current_model.clone(),
1605            messages: request_messages,
1606            temperature: 0.7,
1607            stream: true,
1608            tools: None,
1609        };
1610
1611        if let Err(e) = preflight_chat_request(
1612            &current_model,
1613            &request.messages,
1614            &[],
1615            self.current_context_length(),
1616        ) {
1617            let tag = classify_runtime_failure_tag(&e);
1618            let _ = tx
1619                .send(InferenceEvent::ProviderStatus {
1620                    state: provider_state_for_failure_tag(tag),
1621                    summary: compact_runtime_failure_summary(tag, &e),
1622                })
1623                .await;
1624            let _ = tx
1625                .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1626                .await;
1627            let _ = tx.send(InferenceEvent::Done).await;
1628            return Ok(());
1629        }
1630
1631        let mut last_err = String::new();
1632        let mut response_opt: Option<reqwest::Response> = None;
1633        for attempt in 0..2u32 {
1634            match self.client.post(&self.api_url).json(&request).send().await {
1635                Ok(res) if res.status().is_success() => {
1636                    response_opt = Some(res);
1637                    break;
1638                }
1639                Ok(res) if res.status().as_u16() >= 500 => {
1640                    last_err = format!("LM Studio error {}", res.status());
1641                }
1642                Ok(res) => {
1643                    let status = res.status();
1644                    let body = res.text().await.unwrap_or_default();
1645                    let preview = &body[..body.len().min(300)];
1646                    let detail = format!("LM Studio error {}: {}", status, preview);
1647                    let tag = classify_runtime_failure_tag(&detail);
1648                    let _ = tx
1649                        .send(InferenceEvent::ProviderStatus {
1650                            state: provider_state_for_failure_tag(tag),
1651                            summary: compact_runtime_failure_summary(tag, &detail),
1652                        })
1653                        .await;
1654                    let _ = tx
1655                        .send(InferenceEvent::Error(format_runtime_failure_message(
1656                            &detail,
1657                        )))
1658                        .await;
1659                    let _ = tx.send(InferenceEvent::Done).await;
1660                    return Ok(());
1661                }
1662                Err(e) if e.is_timeout() || e.is_connect() => {
1663                    last_err = format!("Request failed: {}", e);
1664                }
1665                Err(e) => {
1666                    let detail = format!("Request failed: {}", e);
1667                    let tag = classify_runtime_failure_tag(&detail);
1668                    let _ = tx
1669                        .send(InferenceEvent::ProviderStatus {
1670                            state: provider_state_for_failure_tag(tag),
1671                            summary: compact_runtime_failure_summary(tag, &detail),
1672                        })
1673                        .await;
1674                    let _ = tx
1675                        .send(InferenceEvent::Error(format_runtime_failure_message(
1676                            &detail,
1677                        )))
1678                        .await;
1679                    let _ = tx.send(InferenceEvent::Done).await;
1680                    return Ok(());
1681                }
1682            }
1683            if attempt < 1 {
1684                let _ = tx
1685                    .send(InferenceEvent::ProviderStatus {
1686                        state: ProviderRuntimeState::Recovering,
1687                        summary: "LM Studio degraded during stream startup; retrying once.".into(),
1688                    })
1689                    .await;
1690                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1691            }
1692        }
1693        let Some(res) = response_opt else {
1694            let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1695            let tag = classify_runtime_failure_tag(&detail);
1696            let _ = tx
1697                .send(InferenceEvent::ProviderStatus {
1698                    state: provider_state_for_failure_tag(tag),
1699                    summary: compact_runtime_failure_summary(tag, &detail),
1700                })
1701                .await;
1702            let _ = tx
1703                .send(InferenceEvent::Error(format_runtime_failure_message(
1704                    &detail,
1705                )))
1706                .await;
1707            let _ = tx.send(InferenceEvent::Done).await;
1708            return Ok(());
1709        };
1710
1711        use futures::StreamExt;
1712        let mut byte_stream = res.bytes_stream();
1713
1714        // [Collaborative Strategy] TokenBuffer refactor suggested by Hematite local agent.
1715        // Aggregates tokens to ensure coherent linguistic chunks for UI/Voice.
1716        let mut line_buffer = String::new();
1717        let mut content_buffer = String::new();
1718        let mut past_think = false;
1719        let mut emitted_any_content = false;
1720        let mut emitted_live_status = false;
1721
1722        // Immediate cancel gate: break *before* awaiting the stream
1723        // so Escape works even when LM Studio is silent between chunks.
1724        loop {
1725            let next = tokio::select! {
1726                // Race: next SSE chunk vs cancel poll
1727                chunk = byte_stream.next() => chunk,
1728                _ = tokio::time::sleep(std::time::Duration::from_millis(50)) => {
1729                    if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1730                        break;
1731                    }
1732                    continue;
1733                }
1734            };
1735
1736            let Some(item) = next else { break };
1737
1738            let chunk = match item {
1739                Ok(chunk) => chunk,
1740                Err(e) => {
1741                    let detail = format!("Request failed: {}", e);
1742                    let tag = classify_runtime_failure_tag(&detail);
1743                    let _ = tx
1744                        .send(InferenceEvent::ProviderStatus {
1745                            state: provider_state_for_failure_tag(tag),
1746                            summary: compact_runtime_failure_summary(tag, &detail),
1747                        })
1748                        .await;
1749                    let _ = tx
1750                        .send(InferenceEvent::Error(format_runtime_failure_message(
1751                            &detail,
1752                        )))
1753                        .await;
1754                    let _ = tx.send(InferenceEvent::Done).await;
1755                    return Ok(());
1756                }
1757            };
1758            line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1759
1760            while let Some(pos) = line_buffer.find("\n\n") {
1761                let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1762                let data_pos = match event_str.find("data: ") {
1763                    Some(p) => p,
1764                    None => continue,
1765                };
1766
1767                let data = event_str[data_pos + 6..].trim();
1768                if data == "[DONE]" {
1769                    break;
1770                }
1771
1772                if let Ok(json) = serde_json::from_str::<Value>(data) {
1773                    let delta = &json["choices"][0]["delta"];
1774
1775                    // Process reasoning/thought deltas (Qwen/O1 style)
1776                    if let Some(reasoning) = delta["reasoning_content"]
1777                        .as_str()
1778                        .or_else(|| delta["thought"].as_str())
1779                    {
1780                        if !reasoning.is_empty() {
1781                            past_think = false; // We are in reasoning mode
1782                            content_buffer.push_str(reasoning);
1783                            if content_buffer.len() > 30
1784                                && (reasoning.contains('\n') || reasoning.contains('.'))
1785                            {
1786                                let _ = tx
1787                                    .send(InferenceEvent::Thought(content_buffer.clone()))
1788                                    .await;
1789                                emitted_any_content = true;
1790                                content_buffer.clear();
1791                            }
1792                        }
1793                    }
1794
1795                    // Process standard content deltas
1796                    if let Some(content) = delta["content"].as_str() {
1797                        if content.is_empty() {
1798                            continue;
1799                        }
1800
1801                        // Auto-transition: if we have content but were in 'thinking' mode,
1802                        // and haven't seen an explicit tag, assume reasoning is over once content is non-empty.
1803                        if !past_think && !content_buffer.is_empty() && !content.trim().is_empty() {
1804                            // Only transition if the content isn't logically part of the thinking
1805                            // block (some models mix them). Standard heuristic: first non-whitespace content.
1806                            let _ = tx
1807                                .send(InferenceEvent::Thought(content_buffer.clone()))
1808                                .await;
1809                            content_buffer.clear();
1810                            past_think = true;
1811                        }
1812
1813                        if !past_think {
1814                            let lc = content.to_lowercase();
1815                            let close = lc
1816                                .find("<channel|>")
1817                                .map(|i| (i, "<channel|>".len()))
1818                                .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1819
1820                            if let Some((tag_start, tag_len)) = close {
1821                                // Flush any existing thought buffer
1822                                let before = &content[..tag_start];
1823                                content_buffer.push_str(before);
1824                                if !content_buffer.trim().is_empty() {
1825                                    let _ = tx
1826                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1827                                        .await;
1828                                    emitted_any_content = true;
1829                                }
1830                                content_buffer.clear();
1831
1832                                past_think = true;
1833                                let after = content[tag_start + tag_len..].trim_start_matches('\n');
1834                                content_buffer.push_str(after);
1835                            } else {
1836                                // Still in reasoning block
1837                                content_buffer.push_str(content);
1838                                if content_buffer.len() > 30
1839                                    && (content.contains('\n') || content.contains('.'))
1840                                {
1841                                    let _ = tx
1842                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1843                                        .await;
1844                                    emitted_any_content = true;
1845                                    content_buffer.clear();
1846                                }
1847                            }
1848                        } else {
1849                            // PAST THINK: final answer tokens.
1850                            content_buffer.push_str(content);
1851                            let is_boundary = content.contains(' ')
1852                                || content.contains('.')
1853                                || content.contains('!')
1854                                || content.contains('?');
1855
1856                            if content_buffer.len() > 10 && is_boundary {
1857                                if !emitted_live_status {
1858                                    let _ = tx
1859                                        .send(InferenceEvent::ProviderStatus {
1860                                            state: ProviderRuntimeState::Live,
1861                                            summary: String::new(),
1862                                        })
1863                                        .await;
1864                                    emitted_live_status = true;
1865                                }
1866                                let _ =
1867                                    tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1868                                emitted_any_content = true;
1869                                content_buffer.clear();
1870                            }
1871                        }
1872                    }
1873                }
1874            }
1875        }
1876
1877        // Final Flush
1878        if !content_buffer.is_empty() {
1879            if past_think {
1880                if !emitted_live_status {
1881                    let _ = tx
1882                        .send(InferenceEvent::ProviderStatus {
1883                            state: ProviderRuntimeState::Live,
1884                            summary: String::new(),
1885                        })
1886                        .await;
1887                }
1888                let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1889            } else {
1890                let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1891            }
1892            emitted_any_content = true;
1893        }
1894
1895        if !emitted_any_content {
1896            let _ = tx
1897                .send(InferenceEvent::ProviderStatus {
1898                    state: ProviderRuntimeState::EmptyResponse,
1899                    summary: compact_runtime_failure_summary(
1900                        "empty_model_response",
1901                        "Empty response from model",
1902                    ),
1903                })
1904                .await;
1905            let _ = tx
1906                .send(InferenceEvent::Error(format_runtime_failure_message(
1907                    "Empty response from model",
1908                )))
1909                .await;
1910            let _ = tx.send(InferenceEvent::Done).await;
1911            return Ok(());
1912        }
1913
1914        let _ = tx.send(InferenceEvent::Done).await;
1915        Ok(())
1916    }
1917
1918    /// Single-turn streaming (legacy helper used by startup sequence).
1919    pub async fn stream_generation(
1920        &self,
1921        prompt: &str,
1922        snark: u8,
1923        chaos: u8,
1924        brief: bool,
1925        professional: bool,
1926        tx: mpsc::Sender<InferenceEvent>,
1927    ) -> Result<(), Box<dyn std::error::Error>> {
1928        let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1929        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1930        self.stream_messages(&messages, tx).await
1931    }
1932
1933    // ── Swarm worker helpers (non-streaming) ──────────────────────────────────
1934
1935    /// Runs a task using the `worker_model` if set, otherwise falls back to the main `model`.
1936    pub async fn generate_task_worker(
1937        &self,
1938        prompt: &str,
1939        professional: bool,
1940    ) -> Result<String, String> {
1941        let current_model = self.current_model();
1942        let model = self
1943            .worker_model
1944            .as_deref()
1945            .unwrap_or(current_model.as_str());
1946        self.generate_task_with_model(prompt, 0.1, professional, model)
1947            .await
1948    }
1949
1950    pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1951        self.generate_task_with_temp(prompt, 0.1, professional)
1952            .await
1953    }
1954
1955    pub async fn generate_task_with_temp(
1956        &self,
1957        prompt: &str,
1958        temp: f32,
1959        professional: bool,
1960    ) -> Result<String, String> {
1961        let current_model = self.current_model();
1962        self.generate_task_with_model(prompt, temp, professional, &current_model)
1963            .await
1964    }
1965
1966    pub async fn generate_task_with_model(
1967        &self,
1968        prompt: &str,
1969        temp: f32,
1970        professional: bool,
1971        model: &str,
1972    ) -> Result<String, String> {
1973        let _permit = self
1974            .kv_semaphore
1975            .acquire()
1976            .await
1977            .map_err(|e| e.to_string())?;
1978
1979        let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1980        let request_messages = if should_use_gemma_native_formatting(self, model) {
1981            prepare_gemma_native_messages(&[
1982                ChatMessage::system(&system),
1983                ChatMessage::user(prompt),
1984            ])
1985        } else {
1986            vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1987        };
1988        let request = ChatRequest {
1989            model: model.to_string(),
1990            messages: request_messages,
1991            temperature: temp,
1992            stream: false,
1993            tools: None,
1994        };
1995
1996        preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1997
1998        let res = self
1999            .client
2000            .post(&self.api_url)
2001            .json(&request)
2002            .send()
2003            .await
2004            .map_err(|e| format!("LM Studio request failed: {}", e))?;
2005
2006        let body: ChatResponse = res
2007            .json()
2008            .await
2009            .map_err(|e| format!("Failed to parse response: {}", e))?;
2010
2011        body.choices
2012            .first()
2013            .and_then(|c| c.message.content.clone())
2014            .ok_or_else(|| "Empty response from model".to_string())
2015    }
2016
2017    // ── History management ────────────────────────────────────────────────────
2018
2019    /// Prune middle turns when context grows too large, keeping system + recent N.
2020    #[allow(dead_code)]
2021    pub fn snip_history(
2022        &self,
2023        turns: &[ChatMessage],
2024        max_tokens_estimate: usize,
2025        keep_recent: usize,
2026    ) -> Vec<ChatMessage> {
2027        let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
2028        if total_chars / 4 <= max_tokens_estimate {
2029            return turns.to_vec();
2030        }
2031        let keep = keep_recent.min(turns.len());
2032        let mut snipped = vec![turns[0].clone()];
2033        if turns.len() > keep + 1 {
2034            snipped.push(ChatMessage::system(&format!(
2035                "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
2036                turns.len() - keep - 1
2037            )));
2038            snipped.extend_from_slice(&turns[turns.len() - keep..]);
2039        } else {
2040            snipped = turns.to_vec();
2041        }
2042        snipped
2043    }
2044}
2045
2046fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
2047    serde_json::to_vec(value)
2048        .ok()
2049        .map_or(0, |bytes| bytes.len() / 4 + 1)
2050}
2051
2052const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
2053
2054fn estimate_message_tokens(message: &ChatMessage) -> usize {
2055    let content_tokens = match &message.content {
2056        MessageContent::Text(s) => s.len() / 4 + 1,
2057        MessageContent::Parts(parts) => parts
2058            .iter()
2059            .map(|part| match part {
2060                ContentPart::Text { text } => text.len() / 4 + 1,
2061                // Image payloads are transported as data URLs, but their base64
2062                // length should not be treated like plain text context pressure.
2063                ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
2064            })
2065            .sum(),
2066    };
2067    let tool_tokens: usize = message
2068        .tool_calls
2069        .iter()
2070        .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
2071        .sum();
2072    content_tokens + tool_tokens + 6
2073}
2074
2075pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
2076    messages.iter().map(estimate_message_tokens).sum()
2077}
2078
2079fn reserved_output_tokens(context_length: usize) -> usize {
2080    let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2081    proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2082}
2083
2084pub fn estimate_prompt_pressure(
2085    messages: &[ChatMessage],
2086    tools: &[ToolDefinition],
2087    context_length: usize,
2088) -> (usize, usize, usize, u8) {
2089    let estimated_input_tokens =
2090        estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2091    let reserved_output = reserved_output_tokens(context_length);
2092    let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2093    let percent = if context_length == 0 {
2094        0
2095    } else {
2096        ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2097    };
2098    (
2099        estimated_input_tokens,
2100        reserved_output,
2101        estimated_total,
2102        percent,
2103    )
2104}
2105
2106fn preflight_chat_request(
2107    model: &str,
2108    messages: &[ChatMessage],
2109    tools: &[ToolDefinition],
2110    context_length: usize,
2111) -> Result<(), String> {
2112    let (estimated_input_tokens, reserved_output, estimated_total, _) =
2113        estimate_prompt_pressure(messages, tools, context_length);
2114
2115    if estimated_total > context_length {
2116        return Err(format!(
2117            "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2118            model, estimated_input_tokens, reserved_output, estimated_total, context_length
2119        ));
2120    }
2121
2122    Ok(())
2123}
2124
/// Collect project instruction files from the working directory and its ancestors.
///
/// Visits the current working directory and up to three of its ancestors (four
/// directories in total). In each one it looks for `CLAUDE.md`,
/// `CLAUDE.local.md`, and `.hematite/instructions.md`. Unreadable or blank
/// files are skipped, and files whose content hashes the same are included once.
/// Each file is truncated to at most 4,000 bytes (cut on a UTF-8 character
/// boundary) and the combined payload is kept within 12,000 bytes.
///
/// Returns an empty string when the working directory cannot be determined or
/// nothing was found; otherwise a `# Project Instructions` section.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;

    let Ok(mut dir) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;

    let candidates = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    for _ in 0..4 {
        for name in &candidates {
            let path = dir.join(name);
            // A missing or unreadable file simply fails the read and is skipped.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back up to a char boundary: slicing in the middle of a
                // multi-byte character would panic.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            // Over budget: stop scanning this directory's remaining candidates.
            if total_chars + truncated.len() > MAX_TOTAL {
                break;
            }
            total_chars += truncated.len();
            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
        }
        // Step to the parent directory; stop at the filesystem root.
        if !dir.pop() {
            break;
        }
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2188
/// Extract the reasoning text from a Gemma-4 style thought channel.
///
/// Looks (ASCII case-insensitively) for `<|channel>thought` and returns the
/// trimmed text between it and the next `<channel|>`, or up to the end of
/// `text` when the block is never closed. Returns `None` when the opening tag
/// is absent or the enclosed text is blank.
pub fn extract_think_block(text: &str) -> Option<String> {
    // ASCII-only lowercasing preserves byte offsets, so positions found in
    // `lower` are valid slice indices into `text`. Full Unicode lowercasing can
    // change byte lengths and would misalign them.
    let lower = text.to_ascii_lowercase();

    // Official Gemma-4 Native Tags
    let open_tag = "<|channel>thought";
    let close_tag = "<channel|>";

    let start_pos = lower.find(open_tag)?;
    let content_start = start_pos + open_tag.len();

    let close_pos = lower[content_start..]
        .find(close_tag)
        .map_or(text.len(), |offset| content_start + offset);

    let content = text[content_start..close_pos].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
2211
2212pub fn strip_think_blocks(text: &str) -> String {
2213    // Fast-path: strip a stray </think> the model emits at the start when it skips
2214    // the opening tag (common with Qwen after tool calls). Strip it before the lower
2215    // allocation so it can't slip through any branch below.
2216    let text = {
2217        let t = text.trim_start();
2218        if t.to_lowercase().starts_with("</think>") {
2219            &t[8..]
2220        } else {
2221            text
2222        }
2223    };
2224
2225    let lower = text.to_lowercase();
2226
2227    // Use the official Gemma-4 closing tag — answer is everything after it.
2228    if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
2229        let answer = text[end..]
2230            .replace("<|channel>thought", "")
2231            .replace("<channel|>", "");
2232        return answer.trim().replace("\n\n\n", "\n\n").to_string();
2233    }
2234
2235    // No closing tag — if there's an unclosed opening tag, discard everything before and during it.
2236    let first_open = [
2237        lower.find("<|channel>thought"), // Prioritize Gemma-4 native
2238        lower.find("<think>"),
2239        lower.find("<thought>"),
2240        lower.find("<|think|>"),
2241    ]
2242    .iter()
2243    .filter_map(|&x| x)
2244    .min();
2245
2246    if let Some(start) = first_open {
2247        if start > 0 {
2248            return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
2249        }
2250        return String::new();
2251    }
2252
2253    // If the model outputs 'naked' reasoning without tags:
2254    // Strip leading sentences like "The user asked..." or "I should present..."
2255    // if they appear before actual answer content.
2256    let naked_reasoning_phrases: &[&str] = &[
2257        "the user asked",
2258        "the user is asking",
2259        "the user wants",
2260        "i will structure",
2261        "i should provide",
2262        "i should give",
2263        "i should avoid",
2264        "i should note",
2265        "i should focus",
2266        "i should keep",
2267        "i should respond",
2268        "i should present",
2269        "i should display",
2270        "i should show",
2271        "i need to",
2272        "i can see from",
2273        "without being overly",
2274        "let me ",
2275        "necessary information in my identity",
2276        "was computed successfully",
2277        "computed successfully",
2278    ];
2279    let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
2280    if is_naked_reasoning {
2281        let lines: Vec<&str> = text.lines().collect();
2282        if !lines.is_empty() {
2283            // Skip leading lines that are themselves reasoning prose or blank.
2284            // Stop skipping at the first line that looks like real answer content.
2285            let mut start_idx = 0;
2286            for (i, line) in lines.iter().enumerate() {
2287                let l = line.to_lowercase();
2288                let is_reasoning_line =
2289                    naked_reasoning_phrases.iter().any(|p| l.contains(p)) || l.trim().is_empty();
2290                if is_reasoning_line {
2291                    start_idx = i + 1;
2292                } else {
2293                    break;
2294                }
2295            }
2296            if start_idx < lines.len() {
2297                return lines[start_idx..]
2298                    .join("\n")
2299                    .trim()
2300                    .replace("\n\n\n", "\n\n")
2301                    .to_string();
2302            }
2303            // Entire response was reasoning prose — return empty.
2304            return String::new();
2305        }
2306    }
2307
2308    // Strip leaked XML tool-call fragments that Qwen sometimes emits when it
2309    // abandons a tool call mid-generation (e.g. </parameter></function></tool_call>).
2310    let cleaned = strip_xml_tool_call_artifacts(text);
2311    cleaned.trim().replace("\n\n\n", "\n\n").to_string()
2312}
2313
/// Remove stray XML tool-call tags that local models occasionally leak into
/// visible output when they start and then abandon a tool call.
///
/// Every tag in the table below is deleted wherever it occurs, ignoring ASCII
/// case. Tags are processed in table order and each one is removed repeatedly
/// until no occurrence is left, so a tag that only forms once an inner tag has
/// been deleted is removed as well. Text outside the tags, including
/// surrounding whitespace, is returned unchanged.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    // Tags to remove (open and close forms). All entries are lowercase ASCII.
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        // Stray think/reasoning closing tags that leak after block extraction.
        "</think>",
        "</thought>",
        "</thinking>",
    ];

    let mut out = text.to_string();
    // Case-folded shadow of `out` used only for searching. ASCII folding never
    // changes byte lengths, so an offset found here is valid in `out` too.
    // (Full Unicode lowercasing can grow or shrink characters, which would
    // misalign the offsets and could land in the middle of a character.)
    let mut folded = out.to_ascii_lowercase();

    for tag in XML_ARTIFACTS {
        while let Some(pos) = folded.find(*tag) {
            let span = pos..pos + tag.len();
            // A match is pure ASCII, so both ends of `span` are char boundaries.
            out.replace_range(span.clone(), "");
            folded.replace_range(span, "");
        }
    }
    out
}
2346
2347/// Extract native Gemma-4 <|tool_call|> tags from text.
2348/// Format: <|tool_call|>call:func_name{key:<|"|>value<|"|>, key2:value2}<tool_call|>
2349pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2350    use regex::Regex;
2351    let mut results = Vec::new();
2352
2353    // -- Format 1: Gemma 4 Native (call:name{args}) --
2354    let re_call = Regex::new(
2355        r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2356    ).unwrap();
2357    let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2358
2359    for cap in re_call.captures_iter(text) {
2360        let name = cap[1].to_string();
2361        let args_str = &cap[2];
2362        let mut arguments = serde_json::Map::new();
2363
2364        for arg_cap in re_arg.captures_iter(args_str) {
2365            let key = arg_cap[1].to_string();
2366            let val_raw = arg_cap
2367                .get(2)
2368                .map(|m| m.as_str())
2369                .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2370                .unwrap_or("")
2371                .trim();
2372            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2373
2374            let val = if normalized_raw == "true" {
2375                Value::Bool(true)
2376            } else if normalized_raw == "false" {
2377                Value::Bool(false)
2378            } else if let Ok(n) = normalized_raw.parse::<i64>() {
2379                Value::Number(n.into())
2380            } else if let Ok(n) = normalized_raw.parse::<u64>() {
2381                Value::Number(n.into())
2382            } else if let Ok(n) = normalized_raw.parse::<f64>() {
2383                serde_json::Number::from_f64(n)
2384                    .map(Value::Number)
2385                    .unwrap_or(Value::String(normalized_raw.clone()))
2386            } else {
2387                Value::String(normalized_raw)
2388            };
2389
2390            arguments.insert(key, val);
2391        }
2392
2393        results.push(ToolCallResponse {
2394            id: format!("call_{}", rand::random::<u32>()),
2395            call_type: "function".to_string(),
2396            function: ToolCallFn {
2397                name,
2398                arguments: Value::Object(arguments).to_string(),
2399            },
2400        });
2401    }
2402
2403    // -- Format 2: XML (Qwen/Claude style) --
2404    let re_xml_call = Regex::new(
2405        r#"(?s)<tool_call>\s*<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>)?\s*</tool_call>"#
2406    ).unwrap();
2407    let re_xml_param =
2408        Regex::new(r#"(?s)<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)</parameter>"#).unwrap();
2409
2410    for cap in re_xml_call.captures_iter(text) {
2411        let name = cap[1].to_string();
2412        let body = &cap[2];
2413        let mut arguments = serde_json::Map::new();
2414
2415        for p_cap in re_xml_param.captures_iter(body) {
2416            let key = p_cap[1].to_string();
2417            let val_raw = p_cap[2].trim();
2418            let val = if val_raw == "true" {
2419                Value::Bool(true)
2420            } else if val_raw == "false" {
2421                Value::Bool(false)
2422            } else if let Ok(n) = val_raw.parse::<i64>() {
2423                Value::Number(n.into())
2424            } else if let Ok(n) = val_raw.parse::<u64>() {
2425                Value::Number(n.into())
2426            } else {
2427                Value::String(val_raw.to_string())
2428            };
2429            arguments.insert(key, val);
2430        }
2431
2432        if !arguments.is_empty() || name == "clear_context" {
2433            results.push(ToolCallResponse {
2434                id: format!("call_{}", rand::random::<u32>()),
2435                call_type: "function".to_string(),
2436                function: ToolCallFn {
2437                    name,
2438                    arguments: Value::Object(arguments).to_string(),
2439                },
2440            });
2441        }
2442    }
2443
2444    results
2445}
2446
2447pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2448    let trimmed = raw.trim();
2449    let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2450
2451    let mut value = match serde_json::from_str::<Value>(&candidate) {
2452        Ok(v) => v,
2453        Err(_) => return candidate,
2454    };
2455    normalize_tool_argument_value(tool_name, &mut value);
2456    value.to_string()
2457}
2458
2459fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2460    match value {
2461        Value::String(s) => *s = normalize_string_arg(s),
2462        Value::Array(items) => {
2463            for item in items {
2464                normalize_tool_argument_value(tool_name, item);
2465            }
2466        }
2467        Value::Object(map) => {
2468            for val in map.values_mut() {
2469                normalize_tool_argument_value(tool_name, val);
2470            }
2471            if tool_name == "grep_files" {
2472                if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2473                    *pattern = normalize_regex_pattern(pattern);
2474                }
2475            }
2476            for key in ["path", "extension", "query", "command", "reason"] {
2477                if let Some(Value::String(s)) = map.get_mut(key) {
2478                    *s = normalize_string_arg(s);
2479                }
2480            }
2481        }
2482        _ => {}
2483    }
2484}
2485
/// Peel exactly one layer of matching quotes off `input`.
///
/// Returns `None` unless `input` both starts and ends with the same quote
/// character (`"`, `'` or a backtick) and has room for both. On success the
/// inner text has `\"` turned into `"`, then `\\` turned into `\`, and is
/// trimmed.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    let mut rest = input.chars();
    // Consume one character from each end; what remains is the inner text.
    let opening = rest.next()?;
    let closing = rest.next_back()?;

    let is_quote = matches!(opening, '"' | '\'' | '`');
    if !is_quote || opening != closing {
        return None;
    }

    let body = rest.as_str();
    let without_escaped_quotes = body.replace("\\\"", "\"");
    let unescaped = without_escaped_quotes.replace("\\\\", "\\");
    Some(unescaped.trim().to_string())
}
2499
/// Trim `input` and strip every layer of matching wrapping quotes.
///
/// While the text starts and ends with the same quote character (`"`, `'` or
/// a backtick) and is at least two bytes long, that pair is removed and the
/// remainder trimmed again. Mismatched or lone quotes are kept.
fn normalize_string_arg(input: &str) -> String {
    const QUOTES: [char; 3] = ['"', '\'', '`'];

    let mut current = input.trim();
    'peel: loop {
        for &quote in QUOTES.iter() {
            // Both strips must succeed, which also rules out a lone quote.
            let inner = current
                .strip_prefix(quote)
                .and_then(|rest| rest.strip_suffix(quote));
            if let Some(inner) = inner {
                current = inner.trim();
                continue 'peel;
            }
        }
        break;
    }
    current.to_string()
}
2517
2518fn normalize_regex_pattern(input: &str) -> String {
2519    let out = normalize_string_arg(input);
2520    if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2521        out[1..out.len() - 1].to_string()
2522    } else {
2523        out
2524    }
2525}
2526
2527fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2528    let mut system_blocks = Vec::new();
2529    let mut prepared = Vec::new();
2530    let mut seeded = false;
2531
2532    for message in messages {
2533        if message.role == "system" {
2534            let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2535                .trim()
2536                .to_string();
2537            if !cleaned.is_empty() {
2538                system_blocks.push(cleaned);
2539            }
2540            continue;
2541        }
2542
2543        let mut clone = message.clone();
2544        clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2545
2546        if !seeded && message.role == "user" {
2547            let mut merged = String::new();
2548            if !system_blocks.is_empty() {
2549                merged.push_str("System instructions for this turn:\n");
2550                merged.push_str(&system_blocks.join("\n\n"));
2551                merged.push_str("\n\n");
2552            }
2553            merged.push_str(clone.content.as_str());
2554            clone.content = MessageContent::Text(merged);
2555            seeded = true;
2556        }
2557
2558        prepared.push(clone);
2559    }
2560
2561    if !seeded && !system_blocks.is_empty() {
2562        prepared.insert(
2563            0,
2564            ChatMessage::user(&format!(
2565                "System instructions for this turn:\n{}",
2566                system_blocks.join("\n\n")
2567            )),
2568        );
2569    }
2570
2571    prepared
2572}
2573
/// Delete legacy turn markers (`<|turn>role\n` openers and `<turn|>` closers)
/// from `text` and trim the result.
///
/// Markers are removed one kind at a time, in the order listed below.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const TURN_MARKERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let without_markers = TURN_MARKERS
        .iter()
        .fold(text.to_string(), |acc, marker| acc.replace(*marker, ""));
    without_markers.trim().to_string()
}
2583
2584pub fn strip_native_tool_call_text(text: &str) -> String {
2585    use regex::Regex;
2586    // Format 1: Gemma 4 Native
2587    let re_call = Regex::new(
2588        r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2589    ).unwrap();
2590    // Format 2: XML (Qwen/Claude style)
2591    let re_xml = Regex::new(r#"(?s)<tool_call>\s*<function=.*?>.*?</tool_call>"#).unwrap();
2592    let re_response =
2593        Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2594            .unwrap();
2595    let without_calls = re_call.replace_all(text, "");
2596    let without_xml = re_xml.replace_all(without_calls.as_ref(), "");
2597    re_response
2598        .replace_all(without_xml.as_ref(), "")
2599        .trim()
2600        .to_string()
2601}
2602
#[cfg(test)]
mod tests {
    use super::*;

    // The system prompt built by the engine must contain the crate's version
    // constant.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let system = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);
        assert!(system.contains(crate::HEMATITE_VERSION));
    }

    // A Gemma-native call whose opening tag is `<|tool_call>` and whose closing
    // tag is `<tool_call|>` must be extracted, with integer arguments parsed as
    // numbers and the backslash-quoted path unwrapped to a plain string.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        // Raw string: the `\"` sequences below are literal backslash + quote.
        let text = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "read_file");

        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args.get("limit").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(args.get("offset").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(
            args.get("path").and_then(|v| v.as_str()),
            Some("src/ui/tui.rs")
        );

        // Stripping must leave neither tag form behind.
        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<tool_call|>"));
    }

    // A transcript in which each call is followed by a `<|tool_response>`
    // segment: both calls must be extracted in order, and stripping must remove
    // every tool-call and tool-response marker.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let text = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "list_files");
        assert_eq!(calls[1].function.name, "read_file");

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<|tool_response"));
        assert!(!stripped.contains("<tool_response|>"));
    }

    // An XML-style call that follows a line of prose must be extracted, with
    // each multi-line parameter body trimmed to its text.
    #[test]
    fn extracts_qwen_xml_tool_calls_from_reasoning() {
        let text = r#"Based on the project structure, I need to check the binary.
<tool_call>
<function=shell>
<parameter=command>
ls -la hematite.exe
</parameter>
<parameter=reason>
Check if the binary exists
</parameter>
</function>
</tool_call>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "shell");

        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(
            args.get("command").and_then(|v| v.as_str()),
            Some("ls -la hematite.exe")
        );
        assert_eq!(
            args.get("reason").and_then(|v| v.as_str()),
            Some("Check if the binary exists")
        );

        // The whole XML block must be gone after stripping.
        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("<function=shell>"));
    }
}