hematite/agent/
inference.rs

1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
7// ── Engine ────────────────────────────────────────────────────────────────────
8
/// Connection and runtime state for the LLM provider this agent talks to.
///
/// Values that change after construction (model id, context length, feature
/// flags) sit behind locks or atomics so they can be updated through `&self`.
pub struct InferenceEngine {
    /// HTTP client used for every provider request.
    pub client: reqwest::Client,
    /// Chat-completions endpoint URL.
    pub api_url: String,
    /// Root URL of the LLM provider (e.g. `http://localhost:1234`).
    /// All non-completions endpoints (models list, health, embeddings) are derived from this.
    pub base_url: String,
    /// NOTE(review): presumably the companion species label; semantics are not visible here.
    pub species: String,
    /// NOTE(review): presumably a personality-intensity setting; confirm against callers.
    pub snark: u8,
    /// Async semaphore. NOTE(review): the name suggests it bounds concurrent
    /// KV-cache users; confirm where permits are acquired.
    pub kv_semaphore: Semaphore,
    /// The model ID currently loaded in LM Studio (auto-detected on boot).
    pub model: std::sync::RwLock<String>,
    /// Context window length in tokens (auto-detected from LM Studio, default 32768).
    pub context_length: std::sync::atomic::AtomicUsize,
    /// Shared session economics record, guarded by a mutex.
    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
    /// Optional model ID for worker-level tasks (Swarms / research).
    pub worker_model: Option<String>,
    /// Opt-in Gemma-native request shaping. Off by default.
    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
    /// Global cancellation token for hard-interrupting the inference stream.
    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
}
30
/// Reports whether `model` names a Gemma 4 build.
///
/// The match is ASCII case-insensitive and looks for either `gemma-4` or
/// `gemma4` anywhere in the identifier.
pub fn is_hematite_native_model(model: &str) -> bool {
    let normalized = model.to_ascii_lowercase();
    ["gemma-4", "gemma4"]
        .iter()
        .any(|marker| normalized.contains(*marker))
}
35
36fn should_use_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37    is_hematite_native_model(model) && engine.gemma_native_formatting_enabled()
38}
39
40// ── OpenAI Tool Definition ────────────────────────────────────────────────────
41
/// One tool entry as sent in the chat request's `tools` array.
#[derive(Serialize, Clone, Debug)]
pub struct ToolDefinition {
    /// Serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub tool_type: String,
    /// Name, description and parameters of the callable function.
    pub function: ToolFunction,
    /// Local-only classification; excluded from serde so it is never sent to the provider.
    #[serde(skip_serializing, skip_deserializing)]
    pub metadata: ToolMetadata,
}
50
/// The `function` payload of a [`ToolDefinition`].
#[derive(Serialize, Clone, Debug)]
pub struct ToolFunction {
    /// Function name the model uses to call the tool.
    pub name: String,
    /// Human-readable description of the tool.
    pub description: String,
    /// Arbitrary JSON describing the arguments — presumably a JSON Schema object; confirm with callers.
    pub parameters: Value,
}
57
/// Coarse grouping assigned to each tool by `tool_metadata_for_name`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ToolCategory {
    /// File reading / searching tools (`read_file`, `grep_files`, ...).
    RepoRead,
    /// File writing / patching tools (`write_file`, `edit_file`, ...).
    RepoWrite,
    /// Shell and host inspection / repair tools.
    Runtime,
    /// Runtime-flow tracing.
    Architecture,
    /// Toolchain description.
    Toolchain,
    /// Build verification.
    Verification,
    /// Git operations.
    Git,
    /// Web research and documentation fetching.
    Research,
    /// Image analysis.
    Vision,
    /// Language-server queries.
    Lsp,
    /// Workflow, pinning, clarification and task-management tools.
    Workflow,
    /// Any tool whose name starts with `mcp__`.
    External,
    /// Fallback for unrecognized tool names.
    Other,
}
74
/// Policy flags attached to a tool; produced by `tool_metadata_for_name`.
///
/// NOTE(review): the flag meanings below are read off the field names and the
/// lookup table; the code that consumes them is outside this view.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ToolMetadata {
    /// Coarse grouping of the tool.
    pub category: ToolCategory,
    /// Set for tools that can change the workspace (writes, shell, git, workflows).
    pub mutates_workspace: bool,
    /// Set for MCP tools and `resolve_host_issue`.
    pub external_surface: bool,
    /// Set for every mutating tool and for all MCP tools.
    pub trust_sensitive: bool,
    /// Set for tools that do not mutate the workspace.
    pub read_only_friendly: bool,
    /// Set for repo read/write tools, `verify_build`, and pin/clarify tools.
    pub plan_scope: bool,
}
84
85pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
86    if name.starts_with("mcp__") {
87        let lower = name.to_ascii_lowercase();
88        let mutates_workspace = [
89            "__edit",
90            "__write",
91            "__create",
92            "__move",
93            "__delete",
94            "__remove",
95            "__rename",
96            "__replace",
97            "__patch",
98        ]
99        .iter()
100        .any(|needle| lower.contains(needle));
101        return ToolMetadata {
102            category: ToolCategory::External,
103            mutates_workspace,
104            external_surface: true,
105            trust_sensitive: true,
106            read_only_friendly: !mutates_workspace,
107            plan_scope: false,
108        };
109    }
110
111    match name {
112        "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
113            category: ToolCategory::RepoRead,
114            mutates_workspace: false,
115            external_surface: false,
116            trust_sensitive: false,
117            read_only_friendly: true,
118            plan_scope: true,
119        },
120        "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => ToolMetadata {
121            category: ToolCategory::RepoWrite,
122            mutates_workspace: true,
123            external_surface: false,
124            trust_sensitive: true,
125            read_only_friendly: false,
126            plan_scope: true,
127        },
128        "trace_runtime_flow" => ToolMetadata {
129            category: ToolCategory::Architecture,
130            mutates_workspace: false,
131            external_surface: false,
132            trust_sensitive: false,
133            read_only_friendly: true,
134            plan_scope: false,
135        },
136        "describe_toolchain" => ToolMetadata {
137            category: ToolCategory::Toolchain,
138            mutates_workspace: false,
139            external_surface: false,
140            trust_sensitive: false,
141            read_only_friendly: true,
142            plan_scope: false,
143        },
144        "shell" => ToolMetadata {
145            category: ToolCategory::Runtime,
146            mutates_workspace: true,
147            external_surface: false,
148            trust_sensitive: true,
149            read_only_friendly: false,
150            plan_scope: false,
151        },
152        "inspect_host" => ToolMetadata {
153            category: ToolCategory::Runtime,
154            mutates_workspace: false,
155            external_surface: false,
156            trust_sensitive: false,
157            read_only_friendly: true,
158            plan_scope: false,
159        },
160        "resolve_host_issue" => ToolMetadata {
161            category: ToolCategory::Runtime,
162            mutates_workspace: true,
163            external_surface: true,
164            trust_sensitive: true,
165            read_only_friendly: false,
166            plan_scope: false,
167        },
168        "run_hematite_maintainer_workflow" => ToolMetadata {
169            category: ToolCategory::Workflow,
170            mutates_workspace: true,
171            external_surface: false,
172            trust_sensitive: true,
173            read_only_friendly: false,
174            plan_scope: false,
175        },
176        "run_workspace_workflow" => ToolMetadata {
177            category: ToolCategory::Workflow,
178            mutates_workspace: true,
179            external_surface: false,
180            trust_sensitive: true,
181            read_only_friendly: false,
182            plan_scope: false,
183        },
184        "verify_build" => ToolMetadata {
185            category: ToolCategory::Verification,
186            mutates_workspace: false,
187            external_surface: false,
188            trust_sensitive: false,
189            read_only_friendly: true,
190            plan_scope: true,
191        },
192        "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
193            ToolMetadata {
194                category: ToolCategory::Git,
195                mutates_workspace: true,
196                external_surface: false,
197                trust_sensitive: true,
198                read_only_friendly: false,
199                plan_scope: false,
200            }
201        }
202        "research_web" | "fetch_docs" => ToolMetadata {
203            category: ToolCategory::Research,
204            mutates_workspace: false,
205            external_surface: false,
206            trust_sensitive: false,
207            read_only_friendly: true,
208            plan_scope: false,
209        },
210        "vision_analyze" => ToolMetadata {
211            category: ToolCategory::Vision,
212            mutates_workspace: false,
213            external_surface: false,
214            trust_sensitive: false,
215            read_only_friendly: true,
216            plan_scope: false,
217        },
218        "lsp_definitions"
219        | "lsp_references"
220        | "lsp_hover"
221        | "lsp_rename_symbol"
222        | "lsp_get_diagnostics"
223        | "lsp_search_symbol" => ToolMetadata {
224            category: ToolCategory::Lsp,
225            mutates_workspace: false,
226            external_surface: false,
227            trust_sensitive: false,
228            read_only_friendly: true,
229            plan_scope: false,
230        },
231        "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
232            category: ToolCategory::Workflow,
233            mutates_workspace: false,
234            external_surface: false,
235            trust_sensitive: false,
236            read_only_friendly: true,
237            plan_scope: true,
238        },
239        "manage_tasks" => ToolMetadata {
240            category: ToolCategory::Workflow,
241            mutates_workspace: false,
242            external_surface: false,
243            trust_sensitive: false,
244            read_only_friendly: true,
245            plan_scope: false,
246        },
247        _ => ToolMetadata {
248            category: ToolCategory::Other,
249            mutates_workspace: false,
250            external_surface: false,
251            trust_sensitive: false,
252            read_only_friendly: true,
253            plan_scope: false,
254        },
255    }
256}
257
258// ── Message types ─────────────────────────────────────────────────────────────
259
/// OpenAI-compatible chat message. Content can be a plain string (legacy) or a
/// `Vec` of `ContentPart` (multimodal).
///
/// Optional fields are left out of the serialized JSON when empty / `None`.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ChatMessage {
    /// Message role; the constructors in this file use `system`, `user`,
    /// `assistant` and `tool`.
    pub role: String,
    /// Support both simple string content and complex multi-part content (Vision).
    pub content: MessageContent,
    /// Assistant messages may have tool calls. Defaults to an empty vec (not
    /// null) on deserialization and is skipped on serialization when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tool_calls: Vec<ToolCallResponse>,
    /// Tool message: id of the tool call this message answers.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
    /// Tool message: name of the function that produced the result.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
}
277
/// Message body: either a bare string or a list of typed parts.
///
/// `untagged` means the JSON carries no discriminator; serde tries the
/// variants in declaration order when deserializing.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(untagged)]
pub enum MessageContent {
    /// Plain string content.
    Text(String),
    /// Multi-part content (text and/or image parts).
    Parts(Vec<ContentPart>),
}
284
/// One element of multi-part message content, discriminated by the JSON
/// field `type` (`"text"` or `"image_url"`).
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(tag = "type")]
pub enum ContentPart {
    /// Text fragment.
    #[serde(rename = "text")]
    Text { text: String },
    /// Image reference.
    #[serde(rename = "image_url")]
    ImageUrl { image_url: ImageUrlSource },
}
293
/// Wrapper object holding the URL of an image content part.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ImageUrlSource {
    /// Image location — presumably an http(s) or `data:` URL; confirm against callers.
    pub url: String,
}
298
299impl Default for MessageContent {
300    fn default() -> Self {
301        MessageContent::Text(String::new())
302    }
303}
304
305impl MessageContent {
306    pub fn as_str(&self) -> &str {
307        match self {
308            MessageContent::Text(s) => s,
309            MessageContent::Parts(parts) => {
310                for part in parts {
311                    if let ContentPart::Text { text } = part {
312                        return text;
313                    }
314                }
315                ""
316            }
317        }
318    }
319}
320
321impl ChatMessage {
322    pub fn system(content: &str) -> Self {
323        Self {
324            role: "system".into(),
325            content: MessageContent::Text(content.into()),
326            tool_calls: Vec::new(),
327            tool_call_id: None,
328            name: None,
329        }
330    }
331    pub fn user(content: &str) -> Self {
332        Self {
333            role: "user".into(),
334            content: MessageContent::Text(content.into()),
335            tool_calls: Vec::new(),
336            tool_call_id: None,
337            name: None,
338        }
339    }
340    pub fn user_with_image(text: &str, image_url: &str) -> Self {
341        let mut text_parts = text.to_string();
342        if !text_parts.contains("<|image|>") {
343            text_parts.push_str(" <|image|>");
344        }
345        Self {
346            role: "user".into(),
347            content: MessageContent::Parts(vec![
348                ContentPart::Text { text: text_parts },
349                ContentPart::ImageUrl {
350                    image_url: ImageUrlSource {
351                        url: image_url.into(),
352                    },
353                },
354            ]),
355            tool_calls: Vec::new(),
356            tool_call_id: None,
357            name: None,
358        }
359    }
360    pub fn assistant_text(content: &str) -> Self {
361        Self {
362            role: "assistant".into(),
363            content: MessageContent::Text(content.into()),
364            tool_calls: Vec::new(),
365            tool_call_id: None,
366            name: None,
367        }
368    }
369    pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
370        Self {
371            role: "assistant".into(),
372            content: MessageContent::Text(content.into()),
373            tool_calls: calls,
374            tool_call_id: None,
375            name: None,
376        }
377    }
378    pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
379        Self::tool_result_for_model(tool_call_id, fn_name, content, "")
380    }
381
382    /// Build a tool result message, applying Gemma 4 native markup only when the
383    /// loaded model is actually a Gemma 4 model.
384    pub fn tool_result_for_model(
385        tool_call_id: &str,
386        fn_name: &str,
387        content: &str,
388        model: &str,
389    ) -> Self {
390        let body = if is_hematite_native_model(model) {
391            format!(
392                "<|tool_response>response:{}{}{}<tool_response|>",
393                fn_name, "{", content
394            )
395        } else {
396            content.to_string()
397        };
398        Self {
399            role: "tool".into(),
400            content: MessageContent::Text(body),
401            tool_calls: Vec::new(),
402            tool_call_id: Some(tool_call_id.into()),
403            name: Some(fn_name.into()),
404        }
405    }
406}
407
408// ── Tool call as returned by the model ───────────────────────────────────────
409
/// A single tool invocation requested by the model.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallResponse {
    /// Call identifier; tool result messages carry it back as `tool_call_id`.
    pub id: String,
    /// Serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub call_type: String,
    /// Function name and encoded arguments.
    pub function: ToolCallFn,
}
417
/// Function portion of a [`ToolCallResponse`].
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallFn {
    /// Name of the tool to invoke.
    pub name: String,
    /// JSON-encoded arguments string (as returned by the API).
    pub arguments: String,
}
424
425// ── HTTP request / response shapes ───────────────────────────────────────────
426
/// JSON body sent to the chat-completions endpoint.
#[derive(Serialize)]
struct ChatRequest {
    /// Model id to run the completion on.
    model: String,
    /// Conversation history.
    messages: Vec<ChatMessage>,
    /// Sampling temperature.
    temperature: f32,
    /// Whether a streamed response is requested.
    stream: bool,
    /// Tool definitions; the key is omitted entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    tools: Option<Vec<ToolDefinition>>,
}
436
/// Top-level shape of a chat-completions response body.
#[derive(Deserialize, Debug)]
struct ChatResponse {
    /// Completion candidates.
    choices: Vec<ResponseChoice>,
    /// Token accounting, when the provider reports it.
    usage: Option<TokenUsage>,
}
442
/// Token counts reported by the provider for one request.
///
/// The two cache counters default to 0 when the provider omits them.
#[derive(Deserialize, Debug, Clone)]
pub struct TokenUsage {
    /// Tokens in the prompt.
    pub prompt_tokens: usize,
    /// Tokens in the completion.
    pub completion_tokens: usize,
    /// Total as reported by the provider.
    pub total_tokens: usize,
    /// Cache-hit counter (optional in the payload).
    #[serde(default)]
    pub prompt_cache_hit_tokens: usize,
    /// Cache-read counter (optional in the payload).
    #[serde(default)]
    pub cache_read_input_tokens: usize,
}
453
/// One completion candidate inside a [`ChatResponse`].
#[derive(Deserialize, Debug)]
struct ResponseChoice {
    /// The assistant message for this choice.
    message: ResponseMessage,
    /// Why generation stopped; `None` when absent from the payload.
    #[serde(default)]
    finish_reason: Option<String>,
}
460
/// Assistant message as returned by the provider.
#[derive(Deserialize, Debug)]
struct ResponseMessage {
    /// Visible text content, if any.
    content: Option<String>,
    /// Structured tool calls, if any.
    tool_calls: Option<Vec<ToolCallResponse>>,
    /// LM Studio routes Qwen3 thinking-mode output here instead of wrapping
    /// it in <think> tags inside `content`. When tool calls are generated
    /// inside a think block, they end up here rather than in `tool_calls`.
    #[serde(default)]
    reasoning_content: Option<String>,
}
471
// NOTE(review): presumably the lower and upper bounds on the number of tokens
// held back for model output when budgeting a prompt; the code that uses them
// is outside this view.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1024;
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4096;
474
/// True when the context window holds at most 8,192 tokens.
fn is_tiny_context_window(context_length: usize) -> bool {
    const TINY_CEILING: usize = 8_192;
    context_length <= TINY_CEILING
}
478
/// True when the context window is larger than 8,192 tokens but no larger
/// than 49,152 tokens.
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
482
/// Public wrapper around the private `is_compact_context_window` check, so
/// other modules can ask the same question.
pub fn is_compact_context_window_pub(context_length: usize) -> bool {
    is_compact_context_window(context_length)
}
486
/// Detects provider error text that indicates the context window was exceeded.
///
/// `lower` is matched as given, so callers must pass already lower-cased text.
/// A hit is either both `n_keep` and `n_ctx` appearing, or any one of the
/// known context-limit phrases.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const LIMIT_PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    let names_both_kv_limits = lower.contains("n_keep") && lower.contains("n_ctx");
    names_both_kv_limits || LIMIT_PHRASES.iter().any(|phrase| lower.contains(*phrase))
}
494
495fn classify_runtime_failure_tag(detail: &str) -> &'static str {
496    let lower = detail.to_ascii_lowercase();
497    if lower.contains("context_window_blocked")
498        || lower.contains("context ceiling reached")
499        || lower.contains("exceeds the")
500        || is_provider_context_limit_detail(&lower)
501    {
502        "context_window"
503    } else if lower.contains("empty response from model")
504        || lower.contains("model returned an empty response")
505    {
506        "empty_model_response"
507    } else if lower.contains("action blocked:")
508        || lower.contains("access denied")
509        || lower.contains("declined by user")
510    {
511        "tool_policy_blocked"
512    } else {
513        "provider_degraded"
514    }
515}
516
/// Returns the operator-facing recovery advice for a failure tag.
///
/// Unknown tags receive the generic "retry, then narrow or restart" advice.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const CONTEXT_WINDOW: &str = "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    const EMPTY_RESPONSE: &str = "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    const POLICY_BLOCKED: &str =
        "Stay inside the allowed workflow or switch modes before retrying.";
    const GENERIC: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";

    if tag == "context_window" {
        CONTEXT_WINDOW
    } else if tag == "empty_model_response" {
        EMPTY_RESPONSE
    } else if tag == "tool_policy_blocked" {
        POLICY_BLOCKED
    } else {
        GENERIC
    }
}
531
532fn format_runtime_failure_message(detail: &str) -> String {
533    let tag = classify_runtime_failure_tag(detail);
534    format!(
535        "[failure:{}] {} Detail: {}",
536        tag,
537        runtime_failure_guidance(tag),
538        detail.trim()
539    )
540}
541
/// Coarse health state of the LLM provider, reported to the operator surface
/// through `InferenceEvent::ProviderStatus`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProviderRuntimeState {
    Booting,
    Live,
    Recovering,
    /// Used for any failure tag other than context-window / empty-response.
    Degraded,
    /// The failure was classified as `context_window`.
    ContextWindow,
    /// The failure was classified as `empty_model_response`.
    EmptyResponse,
}
551
/// Coarse health state of the MCP server layer, reported through
/// `InferenceEvent::McpStatus`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum McpRuntimeState {
    Unconfigured,
    Healthy,
    Degraded,
    Failed,
}
559
/// Typed checkpoint / blocker state reported through
/// `InferenceEvent::OperatorCheckpoint`. Each variant has a stable
/// snake_case label available via `label()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OperatorCheckpointState {
    Idle,
    RecoveringProvider,
    BudgetReduced,
    HistoryCompacted,
    BlockedContextWindow,
    BlockedPolicy,
    BlockedRecentFileEvidence,
    BlockedExactLineWindow,
    BlockedToolLoop,
    BlockedVerification,
}
573
574impl OperatorCheckpointState {
575    pub fn label(self) -> &'static str {
576        match self {
577            OperatorCheckpointState::Idle => "idle",
578            OperatorCheckpointState::RecoveringProvider => "recovering_provider",
579            OperatorCheckpointState::BudgetReduced => "budget_reduced",
580            OperatorCheckpointState::HistoryCompacted => "history_compacted",
581            OperatorCheckpointState::BlockedContextWindow => "blocked_context_window",
582            OperatorCheckpointState::BlockedPolicy => "blocked_policy",
583            OperatorCheckpointState::BlockedRecentFileEvidence => "blocked_recent_file_evidence",
584            OperatorCheckpointState::BlockedExactLineWindow => "blocked_exact_line_window",
585            OperatorCheckpointState::BlockedToolLoop => "blocked_tool_loop",
586            OperatorCheckpointState::BlockedVerification => "blocked_verification",
587        }
588    }
589}
590
591fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
592    match tag {
593        "context_window" => ProviderRuntimeState::ContextWindow,
594        "empty_model_response" => ProviderRuntimeState::EmptyResponse,
595        _ => ProviderRuntimeState::Degraded,
596    }
597}
598
/// Produces a one-line operator summary for a runtime failure.
///
/// The three known tags return fixed sentences. Any other tag yields
/// `LM Studio degraded: <excerpt>`, where the excerpt is the first 12
/// whitespace-separated words of `detail`, capped at 110 bytes (plus `...`
/// when cut). An empty excerpt falls back to a generic sentence.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    const MAX_EXCERPT_WORDS: usize = 12;
    const MAX_EXCERPT_BYTES: usize = 110;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(MAX_EXCERPT_WORDS)
                .collect::<Vec<_>>()
                .join(" ");
            if excerpt.len() > MAX_EXCERPT_BYTES {
                // `String::truncate` panics when the cut lands inside a
                // multi-byte character, so back up to the nearest boundary.
                // Index 0 is always a boundary, so the loop terminates.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }
            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
632
633// ── Events pushed to the TUI ──────────────────────────────────────────────────
634
/// Events emitted by the inference layer for the TUI to render or act on.
///
/// Not `Clone`: `ApprovalRequired` carries a one-shot reply channel.
#[derive(Debug)]
pub enum InferenceEvent {
    /// A text token to append to the current assistant message.
    Token(String),
    /// A text token to be displayed on screen but NOT spoken (e.g. startup greeting).
    MutedToken(String),
    /// Internal model reasoning (shown in side panel, not dialogue).
    Thought(String),
    /// Critical diagnostic feedback from the voice synthesis engine.
    VoiceStatus(String),
    /// A tool call is starting – show a status line in the TUI.
    ToolCallStart {
        id: String,
        name: String,
        args: String,
    },
    /// A tool call completed – show result in the TUI.
    ToolCallResult {
        id: String,
        name: String,
        output: String,
        is_error: bool,
    },
    /// A risky tool requires explicit user approval.
    /// The TUI must send `true` (approved) or `false` (rejected) via `responder`.
    /// When `diff` is Some, the modal renders a coloured before/after diff preview.
    ApprovalRequired {
        id: String,
        name: String,
        display: String,
        /// Pre-formatted diff: lines starting with "- " are removals, "+ " are additions,
        /// "---" is a file header.  None means a plain high-risk approval (no diff).
        diff: Option<String>,
        /// Intent label for mutation protocol (Cyan box trigger).
        mutation_label: Option<String>,
        /// One-shot channel the TUI uses to deliver the approval decision.
        responder: tokio::sync::oneshot::Sender<bool>,
    },
    /// The current agent turn is complete.
    Done,
    /// Indicates the agent is automatically orchestrating a transition to /implement-plan.
    ChainImplementPlan,
    /// An error occurred during inference.
    Error(String),
    /// Compact provider/runtime state for the operator surface.
    ProviderStatus {
        state: ProviderRuntimeState,
        summary: String,
    },
    /// Typed operator checkpoint/blocker state for SPECULAR and recovery UIs.
    OperatorCheckpoint {
        state: OperatorCheckpointState,
        summary: String,
    },
    /// Typed recovery recipe summary for operator/debug surfaces.
    RecoveryRecipe { summary: String },
    /// Compact MCP/runtime server health for the operator surface.
    McpStatus {
        state: McpRuntimeState,
        summary: String,
    },
    /// Current compaction pressure against the adaptive threshold.
    CompactionPressure {
        estimated_tokens: usize,
        threshold_tokens: usize,
        percent: u8,
    },
    /// Current total prompt-budget pressure against the live context window.
    PromptPressure {
        estimated_input_tokens: usize,
        reserved_output_tokens: usize,
        estimated_total_tokens: usize,
        context_length: usize,
        percent: u8,
    },
    /// A generic task progress update (e.g. for single-agent tool execution).
    TaskProgress {
        id: String,
        label: String,
        progress: u8,
    },
    /// Real-time token usage update from the API.
    UsageUpdate(TokenUsage),
    /// The current runtime profile detected from LM Studio.
    RuntimeProfile {
        model_id: String,
        context_length: usize,
    },
    /// Vein index status after each incremental re-index.
    VeinStatus {
        file_count: usize,
        embedded_count: usize,
        docs_only: bool,
    },
    /// File paths the Vein surfaced as relevant to the current turn.
    /// Used to populate ACTIVE CONTEXT with retrieval results.
    VeinContext { paths: Vec<String> },
    /// A new companion was hatched mid-session via /reroll.
    SoulReroll {
        species: String,
        rarity: String,
        shiny: bool,
        personality: String,
    },
    /// A "Dive-In" command (cd <dir> && hematite) to be copied to the clipboard.
    CopyDiveInCommand(String),
    /// Embed model loaded/unloaded mid-session.
    EmbedProfile { model_id: Option<String> },
    /// A single line of live shell output, streamed while the command runs.
    /// Displayed in the SPECULAR panel so the operator sees progress without
    /// waiting for the full command to finish.
    ShellLine(String),
}
747
748// ── Engine implementation ─────────────────────────────────────────────────────
749
750impl InferenceEngine {
751    pub fn new(
752        api_url: String,
753        species: String,
754        snark: u8,
755    ) -> Result<Self, Box<dyn std::error::Error>> {
756        let client = reqwest::Client::builder()
757            .timeout(std::time::Duration::from_secs(180))
758            .build()?;
759
760        // Extract http://host:port as the base for all non-completions endpoints.
761        let base_url = {
762            let trimmed = api_url.trim_end_matches('/');
763            if let Some(scheme_end) = trimmed.find("://") {
764                let after_scheme = &trimmed[scheme_end + 3..];
765                if let Some(path_start) = after_scheme.find('/') {
766                    format!(
767                        "{}://{}",
768                        &trimmed[..scheme_end],
769                        &after_scheme[..path_start]
770                    )
771                } else {
772                    trimmed.to_string()
773                }
774            } else {
775                trimmed.to_string()
776            }
777        };
778
779        let api_url = if api_url.ends_with("/chat/completions") {
780            api_url
781        } else if api_url.ends_with("/") {
782            format!("{}chat/completions", api_url)
783        } else {
784            format!("{}/chat/completions", api_url)
785        };
786
787        Ok(Self {
788            client,
789            api_url,
790            base_url,
791            species,
792            snark,
793            kv_semaphore: Semaphore::new(3),
794            model: std::sync::RwLock::new(String::new()),
795            context_length: std::sync::atomic::AtomicUsize::new(32_768), // Gemma-4 Sweet Spot (32K)
796            economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
797            worker_model: None,
798            gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
799            cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
800        })
801    }
802
803    pub fn set_gemma_native_formatting(&self, enabled: bool) {
804        self.gemma_native_formatting
805            .store(enabled, std::sync::atomic::Ordering::SeqCst);
806    }
807
808    pub fn gemma_native_formatting_enabled(&self) -> bool {
809        self.gemma_native_formatting
810            .load(std::sync::atomic::Ordering::SeqCst)
811    }
812
813    pub fn current_model(&self) -> String {
814        self.model.read().map(|g| g.clone()).unwrap_or_default()
815    }
816
817    pub fn current_context_length(&self) -> usize {
818        self.context_length
819            .load(std::sync::atomic::Ordering::SeqCst)
820    }
821
822    pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
823        if let Ok(mut guard) = self.model.write() {
824            *guard = model.to_string();
825        }
826        self.context_length
827            .store(context_length, std::sync::atomic::Ordering::SeqCst);
828    }
829
830    /// Returns true if LM Studio is reachable.
831    pub async fn health_check(&self) -> bool {
832        let url = format!("{}/v1/models", self.base_url);
833        match self.client.get(&url).send().await {
834            Ok(resp) => resp.status().is_success(),
835            Err(_) => false,
836        }
837    }
838
839    /// Query /api/v0/models and return the first loaded chat model id.
840    /// Uses /api/v0/models (not /v1/models) because the OpenAI-compat endpoint
841    /// omits the `type` field, making it impossible to distinguish embedding
842    /// models from chat models. Falls back to /v1/models with a name heuristic
843    /// if /api/v0/models is unavailable.
844    /// Returns Some("") when LM Studio is reachable but no chat model is loaded
845    /// so callers can distinguish "offline" (None) from "no chat model" (Some("")).
846    pub async fn get_loaded_model(&self) -> Option<String> {
847        #[derive(Deserialize)]
848        struct ModelList {
849            data: Vec<ModelEntry>,
850        }
851        #[derive(Deserialize)]
852        struct ModelEntry {
853            id: String,
854            #[serde(rename = "type", default)]
855            model_type: String,
856            #[serde(default)]
857            state: String,
858        }
859
860        // Try /api/v0/models first — it has type and state fields.
861        if let Ok(resp) = self
862            .client
863            .get(format!("{}/api/v0/models", self.base_url))
864            .send()
865            .await
866        {
867            if let Ok(list) = resp.json::<ModelList>().await {
868                let chat_model = list
869                    .data
870                    .into_iter()
871                    .find(|m| m.model_type != "embeddings" && m.state == "loaded")
872                    .map(|m| m.id)
873                    .unwrap_or_default();
874                return Some(chat_model);
875            }
876        }
877
878        // Fallback: /v1/models lacks type info — use name heuristic to skip embed models.
879        let resp = self
880            .client
881            .get(format!("{}/v1/models", self.base_url))
882            .send()
883            .await
884            .ok()?;
885        let list: ModelList = resp.json().await.ok()?;
886        Some(
887            list.data
888                .into_iter()
889                .find(|m| !m.id.to_lowercase().contains("embed"))
890                .map(|m| m.id)
891                .unwrap_or_default(),
892        )
893    }
894
895    /// Returns the ID of the first loaded embedding model, if any.
896    /// Uses /api/v0/models which includes `type` and `state` fields.
897    /// The OpenAI-compat /v1/models endpoint omits `type` so cannot be used here.
898    /// Accepts any non-empty state (not just "loaded") to handle LM Studio variants
899    /// where the embed model may report a different state string at startup.
900    pub async fn get_embedding_model(&self) -> Option<String> {
901        #[derive(Deserialize)]
902        struct ModelList {
903            data: Vec<ModelEntry>,
904        }
905        #[derive(Deserialize)]
906        struct ModelEntry {
907            id: String,
908            #[serde(rename = "type", default)]
909            model_type: String,
910            #[serde(default)]
911            state: String,
912        }
913        let resp = self
914            .client
915            .get(format!("{}/api/v0/models", self.base_url))
916            .send()
917            .await
918            .ok()?;
919        let list: ModelList = resp.json().await.ok()?;
920        list.data
921            .into_iter()
922            .find(|m| m.model_type == "embeddings" && m.state == "loaded")
923            .map(|m| m.id)
924    }
925
926    /// Detect the loaded model's context window size.
927    /// Tries LM Studio's `/api/v0/models` endpoint first and prefers the loaded
928    /// model's live `loaded_context_length`, then falls back to older
929    /// `context_length` / `max_context_length` style fields.
930    /// Falls back to a heuristic from the model name, then 32K.
931    pub async fn detect_context_length(&self) -> usize {
932        #[derive(Deserialize)]
933        struct LmStudioModel {
934            id: Option<String>,
935            #[serde(rename = "type", default)]
936            model_type: String,
937            state: Option<String>,
938            loaded_context_length: Option<u64>,
939            context_length: Option<u64>,
940            max_context_length: Option<u64>,
941        }
942        #[derive(Deserialize)]
943        struct LmStudioList {
944            data: Vec<LmStudioModel>,
945        }
946
947        // Check api/v0/models (LM Studio specific)
948        if let Ok(resp) = self
949            .client
950            .get(format!("{}/api/v0/models", self.base_url))
951            .send()
952            .await
953        {
954            if let Ok(list) = resp.json::<LmStudioList>().await {
955                let target_model = self.current_model().to_ascii_lowercase();
956                // Never select embedding models for context-length detection.
957                let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
958                let loaded = list
959                    .data
960                    .iter()
961                    .find(|m| {
962                        non_embed(m)
963                            && m.state.as_deref() == Some("loaded")
964                            && m.id
965                                .as_deref()
966                                .map(|id| id.eq_ignore_ascii_case(&target_model))
967                                .unwrap_or(false)
968                    })
969                    .or_else(|| {
970                        list.data
971                            .iter()
972                            .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
973                    })
974                    .or_else(|| {
975                        list.data.iter().find(|m| {
976                            non_embed(m)
977                                && m.id
978                                    .as_deref()
979                                    .map(|id| id.eq_ignore_ascii_case(&target_model))
980                                    .unwrap_or(false)
981                        })
982                    })
983                    .or_else(|| list.data.iter().find(|m| non_embed(m)));
984
985                if let Some(model) = loaded {
986                    if let Some(ctx) = model.loaded_context_length {
987                        if ctx > 0 {
988                            return ctx as usize;
989                        }
990                    }
991                    if let Some(ctx) = model.context_length {
992                        if ctx > 0 {
993                            return ctx as usize;
994                        }
995                    }
996                    if let Some(ctx) = model.max_context_length {
997                        if ctx > 0 && ctx <= 32_768 {
998                            return ctx as usize;
999                        }
1000                    }
1001                }
1002            }
1003        }
1004
1005        // Heuristic fallback:
1006        // If "gemma-4" is detected, we target 32,768 as the baseline standard,
1007        // acknowledging that 131,072 is available for High-Capacity tasks.
1008        if self.current_model().to_lowercase().contains("gemma-4") {
1009            return 32_768;
1010        }
1011
1012        32_768
1013    }
1014
1015    pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
1016        let previous_model = self.current_model();
1017        let previous_context = self.current_context_length();
1018
1019        let detected_model = match self.get_loaded_model().await {
1020            Some(m) if !m.is_empty() => m,            // coding model found
1021            Some(_) => "no model loaded".to_string(), // reachable but no coding model
1022            None => previous_model.clone(),           // LM Studio offline
1023        };
1024
1025        if !detected_model.is_empty() && detected_model != previous_model {
1026            if let Ok(mut guard) = self.model.write() {
1027                *guard = detected_model.clone();
1028            }
1029        }
1030
1031        let detected_context = self.detect_context_length().await;
1032        let effective_model = if detected_model.is_empty() {
1033            previous_model.clone()
1034        } else {
1035            detected_model
1036        };
1037
1038        let changed = effective_model != previous_model || detected_context != previous_context;
1039        self.set_runtime_profile(&effective_model, detected_context);
1040
1041        Some((effective_model, detected_context, changed))
1042    }
1043
1044    pub fn build_system_prompt(
1045        &self,
1046        snark: u8,
1047        chaos: u8,
1048        brief: bool,
1049        professional: bool,
1050        tools: &[ToolDefinition],
1051        reasoning_history: Option<&str>,
1052        mcp_tools: &[crate::agent::mcp::McpTool],
1053    ) -> String {
1054        let mut sys = self.build_system_prompt_legacy(
1055            snark,
1056            chaos,
1057            brief,
1058            professional,
1059            tools,
1060            reasoning_history,
1061        );
1062
1063        if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1064            sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1065            sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1066            for tool in mcp_tools {
1067                let description = tool
1068                    .description
1069                    .as_deref()
1070                    .unwrap_or("No description provided.");
1071                sys.push_str(&format!("- {}: {}\n", tool.name, description));
1072            }
1073        }
1074
1075        sys
1076    }
1077
1078    pub fn build_system_prompt_legacy(
1079        &self,
1080        snark: u8,
1081        _chaos: u8,
1082        brief: bool,
1083        professional: bool,
1084        tools: &[ToolDefinition],
1085        reasoning_history: Option<&str>,
1086    ) -> String {
1087        let current_context_length = self.current_context_length();
1088        if is_tiny_context_window(current_context_length) {
1089            return self.build_system_prompt_tiny(brief, professional);
1090        }
1091        if is_compact_context_window(current_context_length) {
1092            return self.build_system_prompt_compact(brief, professional, tools);
1093        }
1094
1095        // Hematite bootstrap: keep reasoning disciplined without leaking scaffolding into user-facing replies.
1096        let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
1097                                     - You are Hematite, a local coding system working on the user's machine.\n\
1098                                     - The running Hematite build is ");
1099        sys.push_str(&crate::hematite_version_display());
1100        sys.push_str(".\n\
1101                                     - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
1102                                     - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
1103                                     - For simple questions, answer briefly in plain language.\n\
1104                                     - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
1105                                     - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
1106                                     - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
1107                                     - Keep internal reasoning inside channel delimiters.\n\
1108                                     - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
1109                                     <turn|>\n\n");
1110
1111        if let Some(history) = reasoning_history {
1112            if !history.is_empty() {
1113                sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
1114                sys.push_str(history);
1115                sys.push_str("\n\n");
1116            }
1117        }
1118
1119        // ADAPTIVE THOUGHT EFFICIENCY (Gemma-4 Native)
1120        if brief {
1121            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
1122                          - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
1123                          - Depth: Surface-level verification only.\n\n");
1124        } else {
1125            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
1126                          - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
1127                          - Depth: Full multi-step derivation required.\n\n");
1128        }
1129
1130        // IDENTITY & ENVIRONMENT
1131        let os = std::env::consts::OS;
1132        if professional {
1133            sys.push_str(&format!(
1134                "You are Hematite, a local coding system running on {}. \
1135                 The TUI is one interface layer, not your whole identity. \
1136                 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
1137                 Skip filler and keep the focus on the work.\n",
1138                os
1139            ));
1140        } else {
1141            sys.push_str(&format!(
1142                "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
1143                 The terminal UI is only one surface of the system. \
1144                 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
1145                 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
1146                self.species, snark, os
1147            ));
1148        }
1149
1150        // Inject loaded model and context window so the model knows its own budget.
1151        let current_model = self.current_model();
1152        if !current_model.is_empty() {
1153            sys.push_str(&format!(
1154                "Loaded model: {} | Context window: {} tokens. \
1155                 Calibrate response length and tool-call depth to fit within this budget.\n\n",
1156                current_model, current_context_length
1157            ));
1158            if is_hematite_native_model(&current_model) {
1159                sys.push_str(
1160                    "Sovereign native note: prefer exact tool JSON with no extra prose when calling tools. \
1161                     Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
1162                     For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
1163                );
1164            }
1165        } else {
1166            sys.push_str(&format!(
1167                "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
1168                current_context_length
1169            ));
1170        }
1171
1172        // PROTOCOL & TOOLS
1173        let shell_desc = if cfg!(target_os = "windows") {
1174            "[EXTERNAL SHELL]: `powershell` (Windows).\n\
1175             - Use ONLY for builds, tests, or file migrations. \n\
1176             - You MUST use the `powershell` tool directly. \n\
1177             - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
1178        } else {
1179            "[EXTERNAL SHELL]: `bash` (Unix).\n\
1180             - Use ONLY for builds, tests, or file migrations. \n\
1181             - NEVER wrap bash in other shells. \n\n"
1182        };
1183
1184        sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
1185                      [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
1186                      - These are the ONLY way to explore and modify code. \n\
1187                      - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
1188        sys.push_str(shell_desc);
1189
1190        // ANTI-LOOPING & SELF-AUDIT
1191        sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
1192                      SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
1193
1194        // Consolidated: All directives are now handled by the authoritative prompt.rs builder.
1195        sys.push_str("## TURN ADVISORY\n");
1196        if brief {
1197            sys.push_str("- BRIEF MODE: Respond with ONE concise sentence/block unless more code is required.\n");
1198        }
1199        sys.push_str("- INTERNAL REASONING: Plan your move in the thought channel first.\n");
1200
1201        // Scaffolding protocol — enforces build validation after project creation.
1202        sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
1203            2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
1204            3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
1205            4. Fix all errors before declaring success.\n\n\
1206            ## PRE-FLIGHT SCOPING PROTOCOL\n\
1207            Before attempting any multi-file task or complex refactor:\n\
1208            1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
1209            2. Use `auto_pin_context` to keep those files in active context.\n\
1210            3. Only then proceed to deeper edits or research.\n\n\
1211            ## REFACTORING PROTOCOL\n\
1212            When modifying existing code or renaming symbols:\n\
1213            1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
1214            2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
1215            3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
1216
1217        // Inject CLAUDE.md / instruction files from the project directory.
1218        sys.push_str(&load_instruction_files());
1219
1220        // Inject cross-session memories synthesized by DeepReflect.
1221        sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
1222
1223        // Native Gemma-4 Tool Declarations
1224        if !tools.is_empty() {
1225            sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
1226            for tool in tools {
1227                let schema = serde_json::to_string(&tool.function.parameters)
1228                    .unwrap_or_else(|_| "{}".to_string());
1229                sys.push_str(&format!(
1230                    "<|tool>declaration:{}{}{}<tool|>\n",
1231                    tool.function.name, "{", schema
1232                ));
1233                sys.push_str(&format!("// {})\n", tool.function.description));
1234            }
1235        }
1236
1237        sys
1238    }
1239
1240    fn build_system_prompt_compact(
1241        &self,
1242        brief: bool,
1243        professional: bool,
1244        tools: &[ToolDefinition],
1245    ) -> String {
1246        // Compact tier: fits in 16k context. Keeps tool names + one-line descriptions
1247        // but skips full JSON schemas, verbose protocol sections, and CLAUDE.md injection.
1248        let current_model = self.current_model();
1249        let current_context_length = self.current_context_length();
1250        let os = std::env::consts::OS;
1251
1252        let mut sys = String::from("<|turn>system\n<|think|>\n");
1253        sys.push_str(&format!(
1254            "You are Hematite {}, a local coding harness working on the user's machine.\n",
1255            crate::hematite_version_display()
1256        ));
1257        if professional {
1258            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1259        } else {
1260            sys.push_str(&format!(
1261                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1262                self.species
1263            ));
1264        }
1265        sys.push_str(&format!(
1266            "Model: {} | Context: {} tokens. Keep turns focused.\n",
1267            current_model, current_context_length
1268        ));
1269        if is_hematite_native_model(&current_model) {
1270            sys.push_str(
1271                "Sovereign native: use exact tool JSON. No extra prose in tool calls. \
1272                 Raw regex patterns in grep_files, no slash delimiters.\n",
1273            );
1274        }
1275        if cfg!(target_os = "windows") {
1276            sys.push_str(&format!(
1277                "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1278                os
1279            ));
1280        } else {
1281            sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1282        }
1283        if brief {
1284            sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1285        }
1286
1287        sys.push_str(
1288            "\nCORE RULES:\n\
1289             - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1290             - Verify after edits: run `verify_build` after code changes, before committing.\n\
1291             - One tool at a time. Do not batch unrelated tool calls.\n\
1292             - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1293             - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1294             - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1295        );
1296
1297        if !tools.is_empty() {
1298            sys.push_str("\n# AVAILABLE TOOLS\n");
1299            for tool in tools {
1300                let desc: String = tool.function.description.chars().take(120).collect();
1301                sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1302            }
1303        }
1304
1305        sys.push_str("<turn|>\n");
1306        sys
1307    }
1308
1309    fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1310        let current_model = self.current_model();
1311        let current_context_length = self.current_context_length();
1312        let os = std::env::consts::OS;
1313        let mut sys = format!(
1314            "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1315            crate::hematite_version_display()
1316        );
1317        if professional {
1318            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1319        } else {
1320            sys.push_str(&format!(
1321                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1322                self.species
1323            ));
1324        }
1325        if !current_model.is_empty() {
1326            sys.push_str(&format!(
1327                "Loaded model: {} | Context window: {} tokens.\n",
1328                current_model, current_context_length
1329            ));
1330        } else {
1331            sys.push_str(&format!(
1332                "Context window: {} tokens.\n",
1333                current_context_length
1334            ));
1335        }
1336        sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1337        sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1338        sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1339        if cfg!(target_os = "windows") {
1340            sys.push_str(&format!(
1341                "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1342                os
1343            ));
1344        } else {
1345            sys.push_str(&format!(
1346                "You are running on {}. Use the native Unix shell conventions.\n",
1347                os
1348            ));
1349        }
1350        if brief {
1351            sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1352        }
1353        if is_hematite_native_model(&current_model) {
1354            sys.push_str(
1355                "Sovereign native note: use exact tool JSON with no extra prose when calling tools.\n",
1356            );
1357        }
1358        sys.push_str("<turn|>\n");
1359        sys
1360    }
1361
1362    // ── Non-streaming call (used for agentic turns with tool support) ─────────
1363
1364    /// Send messages to the model. Returns (text_content, tool_calls).
1365    /// Exactly one of the two will be Some on a successful response.
1366    pub async fn call_with_tools(
1367        &self,
1368        messages: &[ChatMessage],
1369        tools: &[ToolDefinition],
1370        // Override the model ID for this call. None = use the live runtime model.
1371        model_override: Option<&str>,
1372    ) -> Result<
1373        (
1374            Option<String>,
1375            Option<Vec<ToolCallResponse>>,
1376            Option<TokenUsage>,
1377            Option<String>,
1378        ),
1379        String,
1380    > {
1381        let _permit = self
1382            .kv_semaphore
1383            .acquire()
1384            .await
1385            .map_err(|e| e.to_string())?;
1386
1387        let current_model = self.current_model();
1388        let model = model_override.unwrap_or(current_model.as_str()).to_string();
1389        let filtered_tools = if cfg!(target_os = "windows") {
1390            tools
1391                .iter()
1392                .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1393                .cloned()
1394                .collect::<Vec<_>>()
1395        } else {
1396            tools.to_vec()
1397        };
1398
1399        let request_messages = if should_use_native_formatting(self, &model) {
1400            prepare_gemma_native_messages(messages)
1401        } else {
1402            messages.to_vec()
1403        };
1404
1405        // In compact context windows, restrict tools to the core coding set.
1406        // Full schemas for 36+ tools add 10k+ tokens via the model's chat template (e.g. Gemma 4).
1407        // Sending a small core set keeps schemas available for structured tool-call dispatch
1408        // while staying within the 16k budget.
1409        const COMPACT_CORE_TOOLS: &[&str] = &[
1410            "read_file",
1411            "inspect_lines",
1412            "edit_file",
1413            "write_file",
1414            "grep_files",
1415            "list_files",
1416            "verify_build",
1417            "shell",
1418        ];
1419        let effective_tools = if is_compact_context_window(self.current_context_length()) {
1420            let core: Vec<_> = filtered_tools
1421                .iter()
1422                .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1423                .cloned()
1424                .collect();
1425            if core.is_empty() {
1426                None
1427            } else {
1428                Some(core)
1429            }
1430        } else if filtered_tools.is_empty() {
1431            None
1432        } else {
1433            Some(filtered_tools)
1434        };
1435
1436        let request = ChatRequest {
1437            model: model.clone(),
1438            messages: request_messages,
1439            temperature: 0.2,
1440            stream: false,
1441            tools: effective_tools,
1442        };
1443
1444        // Exponential backoff: retry up to 3× on 5xx / timeout / connect errors.
1445        preflight_chat_request(
1446            &model,
1447            &request.messages,
1448            request.tools.as_deref().unwrap_or(&[]),
1449            self.current_context_length(),
1450        )?;
1451
1452        let mut last_err = String::new();
1453        let mut response_opt: Option<reqwest::Response> = None;
1454        for attempt in 0..3u32 {
1455            match self.client.post(&self.api_url).json(&request).send().await {
1456                Ok(res) if res.status().is_success() => {
1457                    response_opt = Some(res);
1458                    break;
1459                }
1460                Ok(res) if res.status().as_u16() >= 500 => {
1461                    last_err = format!("LM Studio error {}", res.status());
1462                }
1463                Ok(res) => {
1464                    // 4xx — don't retry
1465                    let status = res.status();
1466                    let body = res.text().await.unwrap_or_default();
1467                    let preview = &body[..body.len().min(300)];
1468                    return Err(format!("LM Studio error {}: {}", status, preview));
1469                }
1470                Err(e) if e.is_timeout() || e.is_connect() => {
1471                    last_err = format!("Request failed: {}", e);
1472                }
1473                Err(e) => return Err(format!("Request failed: {}", e)),
1474            }
1475            if attempt < 2 {
1476                let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1477                tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1478            }
1479        }
1480        let res = response_opt
1481            .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1482
1483        let body: ChatResponse = res
1484            .json()
1485            .await
1486            .map_err(|e| format!("Response parse error: {}", e))?;
1487
1488        if let Some(usage) = &body.usage {
1489            let mut econ = self.economics.lock().unwrap();
1490            econ.input_tokens += usage.prompt_tokens;
1491            econ.output_tokens += usage.completion_tokens;
1492        }
1493
1494        let choice = body
1495            .choices
1496            .into_iter()
1497            .next()
1498            .ok_or_else(|| "Empty response from model".to_string())?;
1499
1500        let finish_reason = choice.finish_reason;
1501        let mut tool_calls = choice.message.tool_calls;
1502        let mut content = choice.message.content;
1503
1504        // Gemma-4 Fallback: If the model outputs native <|tool_call|> tags in the text content,
1505        // extract them and treat them as valid tool calls.
1506        if let Some(raw_content) = &content {
1507            let native_calls = extract_native_tool_calls(raw_content);
1508            if !native_calls.is_empty() {
1509                let mut existing = tool_calls.unwrap_or_default();
1510                existing.extend(native_calls);
1511                tool_calls = Some(existing);
1512                let stripped = strip_native_tool_call_text(raw_content);
1513                content = if stripped.trim().is_empty() {
1514                    None
1515                } else {
1516                    Some(stripped)
1517                };
1518            }
1519        }
1520
1521        if is_hematite_native_model(&model) {
1522            if let Some(calls) = tool_calls.as_mut() {
1523                for call in calls.iter_mut() {
1524                    call.function.arguments = normalize_tool_argument_string(
1525                        &call.function.name,
1526                        &call.function.arguments,
1527                    );
1528                }
1529            }
1530        }
1531
1532        // Qwen3 Fallback: When the model generates tool calls inside a <think> block,
1533        // LM Studio routes the entire thinking output (including <tool_call> XML) to
1534        // `reasoning_content` instead of `tool_calls`. If content is empty and we have
1535        // no tool calls yet, check reasoning_content for embedded tool call markup.
1536        let reasoning_text = choice.message.reasoning_content.unwrap_or_default();
1537        if tool_calls.as_ref().map(|v| v.is_empty()).unwrap_or(true)
1538            && content
1539                .as_ref()
1540                .map(|s| s.trim().is_empty())
1541                .unwrap_or(true)
1542            && !reasoning_text.is_empty()
1543        {
1544            let recovered = extract_native_tool_calls(&reasoning_text);
1545            if !recovered.is_empty() {
1546                tool_calls = Some(recovered);
1547                // Clear content so downstream code doesn't see an empty string.
1548                content = None;
1549            } else if finish_reason.as_deref() == Some("stop") {
1550                // Qwen3.5 thinking-mode answer leak: the model wrote its final answer
1551                // inside reasoning_content and left content empty. finish_reason=stop
1552                // with no tool calls means the model intended this as its response.
1553                // Surface reasoning_content as the visible response rather than
1554                // firing the empty-response nudge loop.
1555                content = Some(reasoning_text);
1556            }
1557        }
1558
1559        Ok((content, tool_calls, body.usage, finish_reason))
1560    }
1561
1562    // ── Streaming call (used for plain-text responses) ────────────────────────
1563
1564    /// Stream a conversation (no tools). Emits Token/Done/Error events.
1565    pub async fn stream_messages(
1566        &self,
1567        messages: &[ChatMessage],
1568        tx: mpsc::Sender<InferenceEvent>,
1569    ) -> Result<(), Box<dyn std::error::Error>> {
1570        let current_model = self.current_model();
1571        let request_messages = if should_use_native_formatting(self, &current_model) {
1572            prepare_gemma_native_messages(messages)
1573        } else {
1574            messages
1575                .iter()
1576                .map(|m| {
1577                    let mut clone = m.clone();
1578                    let current_text = m.content.as_str();
1579                    if !current_text.starts_with("<|turn>") {
1580                        clone.content = MessageContent::Text(format!(
1581                            "<|turn>{}\n{}\n<turn|>",
1582                            m.role, current_text
1583                        ));
1584                    }
1585                    clone
1586                })
1587                .collect()
1588        };
1589
1590        let request = ChatRequest {
1591            model: current_model.clone(),
1592            messages: request_messages,
1593            temperature: 0.7,
1594            stream: true,
1595            tools: None,
1596        };
1597
1598        if let Err(e) = preflight_chat_request(
1599            &current_model,
1600            &request.messages,
1601            &[],
1602            self.current_context_length(),
1603        ) {
1604            let tag = classify_runtime_failure_tag(&e);
1605            let _ = tx
1606                .send(InferenceEvent::ProviderStatus {
1607                    state: provider_state_for_failure_tag(tag),
1608                    summary: compact_runtime_failure_summary(tag, &e),
1609                })
1610                .await;
1611            let _ = tx
1612                .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1613                .await;
1614            let _ = tx.send(InferenceEvent::Done).await;
1615            return Ok(());
1616        }
1617
1618        let mut last_err = String::new();
1619        let mut response_opt: Option<reqwest::Response> = None;
1620        for attempt in 0..2u32 {
1621            match self.client.post(&self.api_url).json(&request).send().await {
1622                Ok(res) if res.status().is_success() => {
1623                    response_opt = Some(res);
1624                    break;
1625                }
1626                Ok(res) if res.status().as_u16() >= 500 => {
1627                    last_err = format!("LM Studio error {}", res.status());
1628                }
1629                Ok(res) => {
1630                    let status = res.status();
1631                    let body = res.text().await.unwrap_or_default();
1632                    let preview = &body[..body.len().min(300)];
1633                    let detail = format!("LM Studio error {}: {}", status, preview);
1634                    let tag = classify_runtime_failure_tag(&detail);
1635                    let _ = tx
1636                        .send(InferenceEvent::ProviderStatus {
1637                            state: provider_state_for_failure_tag(tag),
1638                            summary: compact_runtime_failure_summary(tag, &detail),
1639                        })
1640                        .await;
1641                    let _ = tx
1642                        .send(InferenceEvent::Error(format_runtime_failure_message(
1643                            &detail,
1644                        )))
1645                        .await;
1646                    let _ = tx.send(InferenceEvent::Done).await;
1647                    return Ok(());
1648                }
1649                Err(e) if e.is_timeout() || e.is_connect() => {
1650                    last_err = format!("Request failed: {}", e);
1651                }
1652                Err(e) => {
1653                    let detail = format!("Request failed: {}", e);
1654                    let tag = classify_runtime_failure_tag(&detail);
1655                    let _ = tx
1656                        .send(InferenceEvent::ProviderStatus {
1657                            state: provider_state_for_failure_tag(tag),
1658                            summary: compact_runtime_failure_summary(tag, &detail),
1659                        })
1660                        .await;
1661                    let _ = tx
1662                        .send(InferenceEvent::Error(format_runtime_failure_message(
1663                            &detail,
1664                        )))
1665                        .await;
1666                    let _ = tx.send(InferenceEvent::Done).await;
1667                    return Ok(());
1668                }
1669            }
1670            if attempt < 1 {
1671                let _ = tx
1672                    .send(InferenceEvent::ProviderStatus {
1673                        state: ProviderRuntimeState::Recovering,
1674                        summary: "LM Studio degraded during stream startup; retrying once.".into(),
1675                    })
1676                    .await;
1677                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1678            }
1679        }
1680        let Some(res) = response_opt else {
1681            let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1682            let tag = classify_runtime_failure_tag(&detail);
1683            let _ = tx
1684                .send(InferenceEvent::ProviderStatus {
1685                    state: provider_state_for_failure_tag(tag),
1686                    summary: compact_runtime_failure_summary(tag, &detail),
1687                })
1688                .await;
1689            let _ = tx
1690                .send(InferenceEvent::Error(format_runtime_failure_message(
1691                    &detail,
1692                )))
1693                .await;
1694            let _ = tx.send(InferenceEvent::Done).await;
1695            return Ok(());
1696        };
1697
1698        use futures::StreamExt;
1699        let mut byte_stream = res.bytes_stream();
1700
1701        // [Collaborative Strategy] TokenBuffer refactor suggested by Hematite local agent.
1702        // Aggregates tokens to ensure coherent linguistic chunks for UI/Voice.
1703        let mut line_buffer = String::new();
1704        let mut content_buffer = String::new();
1705        let mut past_think = false;
1706        let mut emitted_any_content = false;
1707        let mut emitted_live_status = false;
1708
1709        // Immediate cancel gate: break *before* awaiting the stream
1710        // so Escape works even when LM Studio is silent between chunks.
1711        loop {
1712            let next = tokio::select! {
1713                // Race: next SSE chunk vs cancel poll
1714                chunk = byte_stream.next() => chunk,
1715                _ = tokio::time::sleep(std::time::Duration::from_millis(50)) => {
1716                    if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1717                        break;
1718                    }
1719                    continue;
1720                }
1721            };
1722
1723            let Some(item) = next else { break };
1724
1725            let chunk = match item {
1726                Ok(chunk) => chunk,
1727                Err(e) => {
1728                    let detail = format!("Request failed: {}", e);
1729                    let tag = classify_runtime_failure_tag(&detail);
1730                    let _ = tx
1731                        .send(InferenceEvent::ProviderStatus {
1732                            state: provider_state_for_failure_tag(tag),
1733                            summary: compact_runtime_failure_summary(tag, &detail),
1734                        })
1735                        .await;
1736                    let _ = tx
1737                        .send(InferenceEvent::Error(format_runtime_failure_message(
1738                            &detail,
1739                        )))
1740                        .await;
1741                    let _ = tx.send(InferenceEvent::Done).await;
1742                    return Ok(());
1743                }
1744            };
1745            line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1746
1747            while let Some(pos) = line_buffer.find("\n\n") {
1748                let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1749                let data_pos = match event_str.find("data: ") {
1750                    Some(p) => p,
1751                    None => continue,
1752                };
1753
1754                let data = event_str[data_pos + 6..].trim();
1755                if data == "[DONE]" {
1756                    break;
1757                }
1758
1759                if let Ok(json) = serde_json::from_str::<Value>(data) {
1760                    let delta = &json["choices"][0]["delta"];
1761
1762                    // Process reasoning/thought deltas (Qwen/O1 style)
1763                    if let Some(reasoning) = delta["reasoning_content"]
1764                        .as_str()
1765                        .or_else(|| delta["thought"].as_str())
1766                    {
1767                        if !reasoning.is_empty() {
1768                            past_think = false; // We are in reasoning mode
1769                            content_buffer.push_str(reasoning);
1770                            if content_buffer.len() > 30
1771                                && (reasoning.contains('\n') || reasoning.contains('.'))
1772                            {
1773                                let _ = tx
1774                                    .send(InferenceEvent::Thought(content_buffer.clone()))
1775                                    .await;
1776                                emitted_any_content = true;
1777                                content_buffer.clear();
1778                            }
1779                        }
1780                    }
1781
1782                    // Process standard content deltas
1783                    if let Some(content) = delta["content"].as_str() {
1784                        if content.is_empty() {
1785                            continue;
1786                        }
1787
1788                        // Auto-transition: if we have content but were in 'thinking' mode,
1789                        // and haven't seen an explicit tag, assume reasoning is over once content is non-empty.
1790                        if !past_think && !content_buffer.is_empty() && !content.trim().is_empty() {
1791                            // Only transition if the content isn't logically part of the thinking
1792                            // block (some models mix them). Standard heuristic: first non-whitespace content.
1793                            let _ = tx
1794                                .send(InferenceEvent::Thought(content_buffer.clone()))
1795                                .await;
1796                            content_buffer.clear();
1797                            past_think = true;
1798                        }
1799
1800                        if !past_think {
1801                            let lc = content.to_lowercase();
1802                            let close = lc
1803                                .find("<channel|>")
1804                                .map(|i| (i, "<channel|>".len()))
1805                                .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1806
1807                            if let Some((tag_start, tag_len)) = close {
1808                                // Flush any existing thought buffer
1809                                let before = &content[..tag_start];
1810                                content_buffer.push_str(before);
1811                                if !content_buffer.trim().is_empty() {
1812                                    let _ = tx
1813                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1814                                        .await;
1815                                    emitted_any_content = true;
1816                                }
1817                                content_buffer.clear();
1818
1819                                past_think = true;
1820                                let after = content[tag_start + tag_len..].trim_start_matches('\n');
1821                                content_buffer.push_str(after);
1822                            } else {
1823                                // Still in reasoning block
1824                                content_buffer.push_str(content);
1825                                if content_buffer.len() > 30
1826                                    && (content.contains('\n') || content.contains('.'))
1827                                {
1828                                    let _ = tx
1829                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1830                                        .await;
1831                                    emitted_any_content = true;
1832                                    content_buffer.clear();
1833                                }
1834                            }
1835                        } else {
1836                            // PAST THINK: final answer tokens.
1837                            content_buffer.push_str(content);
1838                            let is_boundary = content.contains(' ')
1839                                || content.contains('.')
1840                                || content.contains('!')
1841                                || content.contains('?');
1842
1843                            if content_buffer.len() > 10 && is_boundary {
1844                                if !emitted_live_status {
1845                                    let _ = tx
1846                                        .send(InferenceEvent::ProviderStatus {
1847                                            state: ProviderRuntimeState::Live,
1848                                            summary: String::new(),
1849                                        })
1850                                        .await;
1851                                    emitted_live_status = true;
1852                                }
1853                                let _ =
1854                                    tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1855                                emitted_any_content = true;
1856                                content_buffer.clear();
1857                            }
1858                        }
1859                    }
1860                }
1861            }
1862        }
1863
1864        // Final Flush
1865        if !content_buffer.is_empty() {
1866            if past_think {
1867                if !emitted_live_status {
1868                    let _ = tx
1869                        .send(InferenceEvent::ProviderStatus {
1870                            state: ProviderRuntimeState::Live,
1871                            summary: String::new(),
1872                        })
1873                        .await;
1874                }
1875                let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1876            } else {
1877                let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1878            }
1879            emitted_any_content = true;
1880        }
1881
1882        if !emitted_any_content {
1883            let _ = tx
1884                .send(InferenceEvent::ProviderStatus {
1885                    state: ProviderRuntimeState::EmptyResponse,
1886                    summary: compact_runtime_failure_summary(
1887                        "empty_model_response",
1888                        "Empty response from model",
1889                    ),
1890                })
1891                .await;
1892            let _ = tx
1893                .send(InferenceEvent::Error(format_runtime_failure_message(
1894                    "Empty response from model",
1895                )))
1896                .await;
1897            let _ = tx.send(InferenceEvent::Done).await;
1898            return Ok(());
1899        }
1900
1901        let _ = tx.send(InferenceEvent::Done).await;
1902        Ok(())
1903    }
1904
1905    /// Single-turn streaming (legacy helper used by startup sequence).
1906    pub async fn stream_generation(
1907        &self,
1908        prompt: &str,
1909        snark: u8,
1910        chaos: u8,
1911        brief: bool,
1912        professional: bool,
1913        tx: mpsc::Sender<InferenceEvent>,
1914    ) -> Result<(), Box<dyn std::error::Error>> {
1915        let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1916        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1917        self.stream_messages(&messages, tx).await
1918    }
1919
1920    // ── Swarm worker helpers (non-streaming) ──────────────────────────────────
1921
1922    /// Runs a task using the `worker_model` if set, otherwise falls back to the main `model`.
1923    pub async fn generate_task_worker(
1924        &self,
1925        prompt: &str,
1926        professional: bool,
1927    ) -> Result<String, String> {
1928        let current_model = self.current_model();
1929        let model = self
1930            .worker_model
1931            .as_deref()
1932            .unwrap_or(current_model.as_str());
1933        self.generate_task_with_model(prompt, 0.1, professional, model)
1934            .await
1935    }
1936
1937    pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1938        self.generate_task_with_temp(prompt, 0.1, professional)
1939            .await
1940    }
1941
1942    pub async fn generate_task_with_temp(
1943        &self,
1944        prompt: &str,
1945        temp: f32,
1946        professional: bool,
1947    ) -> Result<String, String> {
1948        let current_model = self.current_model();
1949        self.generate_task_with_model(prompt, temp, professional, &current_model)
1950            .await
1951    }
1952
1953    pub async fn generate_task_with_model(
1954        &self,
1955        prompt: &str,
1956        temp: f32,
1957        professional: bool,
1958        model: &str,
1959    ) -> Result<String, String> {
1960        let _permit = self
1961            .kv_semaphore
1962            .acquire()
1963            .await
1964            .map_err(|e| e.to_string())?;
1965
1966        let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1967        let request_messages = if should_use_native_formatting(self, model) {
1968            prepare_gemma_native_messages(&[
1969                ChatMessage::system(&system),
1970                ChatMessage::user(prompt),
1971            ])
1972        } else {
1973            vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1974        };
1975        let request = ChatRequest {
1976            model: model.to_string(),
1977            messages: request_messages,
1978            temperature: temp,
1979            stream: false,
1980            tools: None,
1981        };
1982
1983        preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1984
1985        let res = self
1986            .client
1987            .post(&self.api_url)
1988            .json(&request)
1989            .send()
1990            .await
1991            .map_err(|e| format!("LM Studio request failed: {}", e))?;
1992
1993        let body: ChatResponse = res
1994            .json()
1995            .await
1996            .map_err(|e| format!("Failed to parse response: {}", e))?;
1997
1998        body.choices
1999            .first()
2000            .and_then(|c| c.message.content.clone())
2001            .ok_or_else(|| "Empty response from model".to_string())
2002    }
2003
2004    // ── History management ────────────────────────────────────────────────────
2005
2006    /// Prune middle turns when context grows too large, keeping system + recent N.
2007    #[allow(dead_code)]
2008    pub fn snip_history(
2009        &self,
2010        turns: &[ChatMessage],
2011        max_tokens_estimate: usize,
2012        keep_recent: usize,
2013    ) -> Vec<ChatMessage> {
2014        let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
2015        if total_chars / 4 <= max_tokens_estimate {
2016            return turns.to_vec();
2017        }
2018        let keep = keep_recent.min(turns.len());
2019        let mut snipped = vec![turns[0].clone()];
2020        if turns.len() > keep + 1 {
2021            snipped.push(ChatMessage::system(&format!(
2022                "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
2023                turns.len() - keep - 1
2024            )));
2025            snipped.extend_from_slice(&turns[turns.len() - keep..]);
2026        } else {
2027            snipped = turns.to_vec();
2028        }
2029        snipped
2030    }
2031}
2032
2033fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
2034    serde_json::to_vec(value)
2035        .ok()
2036        .map_or(0, |bytes| bytes.len() / 4 + 1)
2037}
2038
2039const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
2040
2041fn estimate_message_tokens(message: &ChatMessage) -> usize {
2042    let content_tokens = match &message.content {
2043        MessageContent::Text(s) => s.len() / 4 + 1,
2044        MessageContent::Parts(parts) => parts
2045            .iter()
2046            .map(|part| match part {
2047                ContentPart::Text { text } => text.len() / 4 + 1,
2048                // Image payloads are transported as data URLs, but their base64
2049                // length should not be treated like plain text context pressure.
2050                ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
2051            })
2052            .sum(),
2053    };
2054    let tool_tokens: usize = message
2055        .tool_calls
2056        .iter()
2057        .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
2058        .sum();
2059    content_tokens + tool_tokens + 6
2060}
2061
2062pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
2063    messages.iter().map(estimate_message_tokens).sum()
2064}
2065
2066fn reserved_output_tokens(context_length: usize) -> usize {
2067    let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2068    proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2069}
2070
2071pub fn estimate_prompt_pressure(
2072    messages: &[ChatMessage],
2073    tools: &[ToolDefinition],
2074    context_length: usize,
2075) -> (usize, usize, usize, u8) {
2076    let estimated_input_tokens =
2077        estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2078    let reserved_output = reserved_output_tokens(context_length);
2079    let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2080    let percent = if context_length == 0 {
2081        0
2082    } else {
2083        ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2084    };
2085    (
2086        estimated_input_tokens,
2087        reserved_output,
2088        estimated_total,
2089        percent,
2090    )
2091}
2092
2093fn preflight_chat_request(
2094    model: &str,
2095    messages: &[ChatMessage],
2096    tools: &[ToolDefinition],
2097    context_length: usize,
2098) -> Result<(), String> {
2099    let (estimated_input_tokens, reserved_output, estimated_total, _) =
2100        estimate_prompt_pressure(messages, tools, context_length);
2101
2102    if estimated_total > context_length {
2103        return Err(format!(
2104            "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2105            model, estimated_input_tokens, reserved_output, estimated_total, context_length
2106        ));
2107    }
2108
2109    Ok(())
2110}
2111
/// Collect project instruction files from the current directory and its ancestors.
///
/// Up to four directory levels are inspected: the current working directory and at
/// most three of its parents. In each one, `CLAUDE.md`, `CLAUDE.local.md` and
/// `.hematite/instructions.md` are read if present.
///
/// * Unreadable, empty or whitespace-only files are skipped.
/// * Files whose content hashes to an already-seen value are skipped.
/// * Each file is cut to at most 4,000 bytes (on a character boundary) and marked
///   with `...[truncated]`.
/// * A file that would push the running total past 12,000 bytes ends the scan of the
///   directory it lives in; later directories are still visited.
///
/// Returns an empty string when the working directory cannot be determined or nothing
/// was collected; otherwise the collected files under a `# Project Instructions`
/// heading.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;

    let Ok(cwd) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;

    let candidates = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    let mut dir = cwd;
    for _ in 0..4 {
        for name in &candidates {
            let path = dir.join(name);
            // A missing or unreadable file simply fails to read; no separate
            // existence check is needed.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back up to a char boundary: slicing in the middle of a multi-byte
                // character would panic.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            if total_chars + truncated.len() > MAX_TOTAL {
                break;
            }
            total_chars += truncated.len();
            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
        }
        match dir.parent().map(|p| p.to_owned()) {
            Some(p) => dir = p,
            None => break,
        }
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2175
/// Returns the text inside the first Gemma-4 thought channel of `text`, if any.
///
/// The block opens with `<|channel>thought` and closes with `<channel|>` (both
/// matched ASCII-case-insensitively). When the closing tag is missing, everything
/// after the opening tag is treated as the block. The result is trimmed; `None` is
/// returned when there is no opening tag or the block is empty.
pub fn extract_think_block(text: &str) -> Option<String> {
    // Official Gemma-4 Native Tags
    const OPEN_TAG: &str = "<|channel>thought";
    const CLOSE_TAG: &str = "<channel|>";

    // ASCII-only lowercasing keeps every byte offset identical to `text`. Full
    // Unicode lowercasing can change byte lengths, which would make offsets found in
    // the lowered copy point at the wrong place (or off a char boundary) in `text`.
    let lower = text.to_ascii_lowercase();

    let content_start = lower.find(OPEN_TAG)? + OPEN_TAG.len();
    let close_pos = lower[content_start..]
        .find(CLOSE_TAG)
        .map_or(text.len(), |offset| content_start + offset);

    let content = text[content_start..close_pos].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
2198
2199pub fn strip_think_blocks(text: &str) -> String {
2200    // Fast-path: strip a stray </think> the model emits at the start when it skips
2201    // the opening tag (common with Qwen after tool calls). Strip it before the lower
2202    // allocation so it can't slip through any branch below.
2203    let text = {
2204        let t = text.trim_start();
2205        if t.to_lowercase().starts_with("</think>") {
2206            &t[8..]
2207        } else {
2208            text
2209        }
2210    };
2211
2212    let lower = text.to_lowercase();
2213
2214    // Use the official Gemma-4 closing tag — answer is everything after it.
2215    if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
2216        let answer = text[end..]
2217            .replace("<|channel>thought", "")
2218            .replace("<channel|>", "");
2219        return answer.trim().replace("\n\n\n", "\n\n").to_string();
2220    }
2221
2222    // No closing tag — if there's an unclosed opening tag, discard everything before and during it.
2223    let first_open = [
2224        lower.find("<|channel>thought"), // Prioritize Gemma-4 native
2225        lower.find("<think>"),
2226        lower.find("<thought>"),
2227        lower.find("<|think|>"),
2228    ]
2229    .iter()
2230    .filter_map(|&x| x)
2231    .min();
2232
2233    if let Some(start) = first_open {
2234        if start > 0 {
2235            return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
2236        }
2237        return String::new();
2238    }
2239
2240    // If the model outputs 'naked' reasoning without tags:
2241    // Strip leading sentences like "The user asked..." or "I should present..."
2242    // if they appear before actual answer content.
2243    let naked_reasoning_phrases: &[&str] = &[
2244        "the user asked",
2245        "the user is asking",
2246        "the user wants",
2247        "i will structure",
2248        "i should provide",
2249        "i should give",
2250        "i should avoid",
2251        "i should note",
2252        "i should focus",
2253        "i should keep",
2254        "i should respond",
2255        "i should present",
2256        "i should display",
2257        "i should show",
2258        "i need to",
2259        "i can see from",
2260        "without being overly",
2261        "let me ",
2262        "necessary information in my identity",
2263        "was computed successfully",
2264        "computed successfully",
2265    ];
2266    let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
2267    if is_naked_reasoning {
2268        let lines: Vec<&str> = text.lines().collect();
2269        if !lines.is_empty() {
2270            // Skip leading lines that are themselves reasoning prose or blank.
2271            // Stop skipping at the first line that looks like real answer content.
2272            let mut start_idx = 0;
2273            for (i, line) in lines.iter().enumerate() {
2274                let l = line.to_lowercase();
2275                let is_reasoning_line =
2276                    naked_reasoning_phrases.iter().any(|p| l.contains(p)) || l.trim().is_empty();
2277                if is_reasoning_line {
2278                    start_idx = i + 1;
2279                } else {
2280                    break;
2281                }
2282            }
2283            if start_idx < lines.len() {
2284                return lines[start_idx..]
2285                    .join("\n")
2286                    .trim()
2287                    .replace("\n\n\n", "\n\n")
2288                    .to_string();
2289            }
2290            // Entire response was reasoning prose — return empty.
2291            return String::new();
2292        }
2293    }
2294
2295    // Strip leaked XML tool-call fragments that Qwen sometimes emits when it
2296    // abandons a tool call mid-generation (e.g. </parameter></function></tool_call>).
2297    let cleaned = strip_xml_tool_call_artifacts(text);
2298    cleaned.trim().replace("\n\n\n", "\n\n").to_string()
2299}
2300
/// Remove stray XML tool-call closing/opening tags that local models occasionally
/// leak into visible output when they start-then-abandon a tool call.
///
/// Matching is ASCII-case-insensitive. Each tag is removed repeatedly until no
/// occurrence remains, so a tag that only forms after an inner removal is removed as
/// well. Surrounding text, including any whitespace left behind, is returned as-is.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    // Tags to remove (both open and close forms). All entries are lowercase ASCII,
    // which the search below relies on.
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        // Stray think/reasoning closing tags that leak after block extraction.
        "</think>",
        "</thought>",
        "</thinking>",
    ];

    let mut out = text.to_string();
    // ASCII lowercasing keeps byte offsets and char boundaries identical to `out`, so
    // a position found in `lower` is always a valid range in `out`. The two strings
    // are edited in lockstep instead of re-lowercasing after every removal.
    let mut lower = out.to_ascii_lowercase();
    for tag in XML_ARTIFACTS {
        while let Some(pos) = lower.find(tag) {
            let end = pos + tag.len();
            out.drain(pos..end);
            lower.drain(pos..end);
        }
    }
    out
}
2333
2334/// Extract native Gemma-4 <|tool_call|> tags from text.
2335/// Format: <|tool_call|>call:func_name{key:<|"|>value<|"|>, key2:value2}<tool_call|>
2336pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2337    use regex::Regex;
2338    let mut results = Vec::new();
2339
2340    // -- Format 1: Gemma 4 Native (call:name{args}) --
2341    let re_call = Regex::new(
2342        r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2343    ).unwrap();
2344    let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2345
2346    for cap in re_call.captures_iter(text) {
2347        let name = cap[1].to_string();
2348        let args_str = &cap[2];
2349        let mut arguments = serde_json::Map::new();
2350
2351        for arg_cap in re_arg.captures_iter(args_str) {
2352            let key = arg_cap[1].to_string();
2353            let val_raw = arg_cap
2354                .get(2)
2355                .map(|m| m.as_str())
2356                .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2357                .unwrap_or("")
2358                .trim();
2359            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2360
2361            let val = if normalized_raw == "true" {
2362                Value::Bool(true)
2363            } else if normalized_raw == "false" {
2364                Value::Bool(false)
2365            } else if let Ok(n) = normalized_raw.parse::<i64>() {
2366                Value::Number(n.into())
2367            } else if let Ok(n) = normalized_raw.parse::<u64>() {
2368                Value::Number(n.into())
2369            } else if let Ok(n) = normalized_raw.parse::<f64>() {
2370                serde_json::Number::from_f64(n)
2371                    .map(Value::Number)
2372                    .unwrap_or(Value::String(normalized_raw.clone()))
2373            } else {
2374                Value::String(normalized_raw)
2375            };
2376
2377            arguments.insert(key, val);
2378        }
2379
2380        results.push(ToolCallResponse {
2381            id: format!("call_{}", rand::random::<u32>()),
2382            call_type: "function".to_string(),
2383            function: ToolCallFn {
2384                name,
2385                arguments: Value::Object(arguments).to_string(),
2386            },
2387        });
2388    }
2389
2390    // -- Format 2: XML (Qwen/Claude style) --
2391    let re_xml_call = Regex::new(
2392        r#"(?s)<tool_call>\s*<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>)?\s*</tool_call>"#
2393    ).unwrap();
2394    let re_xml_param =
2395        Regex::new(r#"(?s)<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)</parameter>"#).unwrap();
2396
2397    for cap in re_xml_call.captures_iter(text) {
2398        let name = cap[1].to_string();
2399        let body = &cap[2];
2400        let mut arguments = serde_json::Map::new();
2401
2402        for p_cap in re_xml_param.captures_iter(body) {
2403            let key = p_cap[1].to_string();
2404            let val_raw = p_cap[2].trim();
2405            let val = if val_raw == "true" {
2406                Value::Bool(true)
2407            } else if val_raw == "false" {
2408                Value::Bool(false)
2409            } else if let Ok(n) = val_raw.parse::<i64>() {
2410                Value::Number(n.into())
2411            } else if let Ok(n) = val_raw.parse::<u64>() {
2412                Value::Number(n.into())
2413            } else {
2414                Value::String(val_raw.to_string())
2415            };
2416            arguments.insert(key, val);
2417        }
2418
2419        results.push(ToolCallResponse {
2420            id: format!("call_{}", rand::random::<u32>()),
2421            call_type: "function".to_string(),
2422            function: ToolCallFn {
2423                name,
2424                arguments: Value::Object(arguments).to_string(),
2425            },
2426        });
2427    }
2428
2429    results
2430}
2431
2432pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2433    let trimmed = raw.trim();
2434    let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2435
2436    let mut value = match serde_json::from_str::<Value>(&candidate) {
2437        Ok(v) => v,
2438        Err(_) => return candidate,
2439    };
2440    normalize_tool_argument_value(tool_name, &mut value);
2441    value.to_string()
2442}
2443
2444fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2445    match value {
2446        Value::String(s) => *s = normalize_string_arg(s),
2447        Value::Array(items) => {
2448            for item in items {
2449                normalize_tool_argument_value(tool_name, item);
2450            }
2451        }
2452        Value::Object(map) => {
2453            for val in map.values_mut() {
2454                normalize_tool_argument_value(tool_name, val);
2455            }
2456            if tool_name == "grep_files" {
2457                if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2458                    *pattern = normalize_regex_pattern(pattern);
2459                }
2460            }
2461            for key in ["path", "extension", "query", "command", "reason"] {
2462                if let Some(Value::String(s)) = map.get_mut(key) {
2463                    *s = normalize_string_arg(s);
2464                }
2465            }
2466        }
2467        _ => {}
2468    }
2469}
2470
/// Strip exactly one layer of matching quotes (`"`, `'` or `` ` ``) from `input`.
///
/// Returns `None` when the input is not wrapped in a matching pair (which
/// includes inputs shorter than two bytes). Otherwise the inner text has `\"`
/// replaced by `"` and then `\\` replaced by `\`, is trimmed, and is returned.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    // A lone quote character fails here too: once the prefix is stripped there
    // is nothing left for the suffix to match.
    let inner = ['"', '\'', '`']
        .iter()
        .find_map(|&quote| input.strip_prefix(quote)?.strip_suffix(quote))?;

    let without_escaped_quotes = inner.replace("\\\"", "\"");
    let without_escaped_backslashes = without_escaped_quotes.replace("\\\\", "\\");
    Some(without_escaped_backslashes.trim().to_string())
}
2484
/// Trim `input` and repeatedly peel matching wrapping quotes (`"`, `'`,
/// `` ` ``), trimming again after each layer, until the text is no longer
/// wrapped in a matching pair.
fn normalize_string_arg(input: &str) -> String {
    /// Inner text if `s` is wrapped in one matching pair of quotes.
    fn peel_once(s: &str) -> Option<&str> {
        ['"', '\'', '`']
            .iter()
            .find_map(|&quote| s.strip_prefix(quote)?.strip_suffix(quote))
    }

    let mut current = input.trim();
    while let Some(inner) = peel_once(current) {
        current = inner.trim();
    }
    current.to_string()
}
2502
2503fn normalize_regex_pattern(input: &str) -> String {
2504    let out = normalize_string_arg(input);
2505    if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2506        out[1..out.len() - 1].to_string()
2507    } else {
2508        out
2509    }
2510}
2511
2512fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2513    let mut system_blocks = Vec::new();
2514    let mut prepared = Vec::new();
2515    let mut seeded = false;
2516
2517    for message in messages {
2518        if message.role == "system" {
2519            let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2520                .trim()
2521                .to_string();
2522            if !cleaned.is_empty() {
2523                system_blocks.push(cleaned);
2524            }
2525            continue;
2526        }
2527
2528        let mut clone = message.clone();
2529        clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2530
2531        if !seeded && message.role == "user" {
2532            let mut merged = String::new();
2533            if !system_blocks.is_empty() {
2534                merged.push_str("System instructions for this turn:\n");
2535                merged.push_str(&system_blocks.join("\n\n"));
2536                merged.push_str("\n\n");
2537            }
2538            merged.push_str(clone.content.as_str());
2539            clone.content = MessageContent::Text(merged);
2540            seeded = true;
2541        }
2542
2543        prepared.push(clone);
2544    }
2545
2546    if !seeded && !system_blocks.is_empty() {
2547        prepared.insert(
2548            0,
2549            ChatMessage::user(&format!(
2550                "System instructions for this turn:\n{}",
2551                system_blocks.join("\n\n")
2552            )),
2553        );
2554    }
2555
2556    prepared
2557}
2558
/// Delete legacy `<|turn>role\n` openers and `<turn|>` closers from `text`
/// and trim the result.
///
/// Markers are removed one kind at a time, in the order listed; a marker that
/// only forms after a later kind has been removed is not revisited.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const TURN_MARKERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let mut cleaned = text.to_string();
    for marker in TURN_MARKERS.iter() {
        cleaned = cleaned.replace(marker, "");
    }
    cleaned.trim().to_string()
}
2568
2569pub fn strip_native_tool_call_text(text: &str) -> String {
2570    use regex::Regex;
2571    // Format 1: Gemma 4 Native
2572    let re_call = Regex::new(
2573        r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2574    ).unwrap();
2575    // Format 2: XML (Qwen/Claude style)
2576    let re_xml = Regex::new(r#"(?s)<tool_call>\s*<function=.*?>.*?</tool_call>"#).unwrap();
2577    let re_response =
2578        Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2579            .unwrap();
2580    let without_calls = re_call.replace_all(text, "");
2581    let without_xml = re_xml.replace_all(without_calls.as_ref(), "");
2582    re_response
2583        .replace_all(without_xml.as_ref(), "")
2584        .trim()
2585        .to_string()
2586}
2587
// Unit tests for system-prompt construction and for the inline tool-call
// extraction / stripping helpers.
#[cfg(test)]
mod tests {
    use super::*;

    // The generated system prompt must embed `crate::HEMATITE_VERSION`.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let system = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);
        assert!(system.contains(crate::HEMATITE_VERSION));
    }

    // Gemma-style call whose opening tag is `<|tool_call>` and closing tag is
    // `<tool_call|>`, with the path value wrapped in backslash-escaped quotes
    // instead of `<|"|>` markers. Numeric arguments must come back as numbers
    // and the path as an unquoted string.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        let text = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "read_file");

        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args.get("limit").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(args.get("offset").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(
            args.get("path").and_then(|v| v.as_str()),
            Some("src/ui/tui.rs")
        );

        // The call markup must be gone from the user-visible text.
        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<tool_call|>"));
    }

    // Transcript in which each tool call is followed by a `<|tool_response>`
    // span written by the model itself. Both calls must be extracted in order,
    // and neither call nor response tags may survive stripping.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let text = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "list_files");
        assert_eq!(calls[1].function.name, "read_file");

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<|tool_response"));
        assert!(!stripped.contains("<tool_response|>"));
    }

    // XML-style call (`<function=…>` / `<parameter=…>`) following a line of
    // prose. Parameter values span their own lines and must come back trimmed.
    #[test]
    fn extracts_qwen_xml_tool_calls_from_reasoning() {
        let text = r#"Based on the project structure, I need to check the binary.
<tool_call>
<function=shell>
<parameter=command>
ls -la hematite.exe
</parameter>
<parameter=reason>
Check if the binary exists
</parameter>
</function>
</tool_call>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "shell");

        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(
            args.get("command").and_then(|v| v.as_str()),
            Some("ls -la hematite.exe")
        );
        assert_eq!(
            args.get("reason").and_then(|v| v.as_str()),
            Some("Check if the binary exists")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("<function=shell>"));
    }
}
hematite/agent/inference.rs

hematite/agent/
inference.rs