hematite/agent/
inference.rs

1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
7// ── Engine ────────────────────────────────────────────────────────────────────
8
/// Shared state for talking to the LLM provider: HTTP client, endpoint URLs,
/// the detected runtime profile (model id + context length) and session-wide
/// flags.
pub struct InferenceEngine {
    /// HTTP client used for every provider request.
    pub client: reqwest::Client,
    /// Chat-completions endpoint URL.
    pub api_url: String,
    /// Root URL of the LLM provider (e.g. `http://localhost:1234`).
    /// All non-completions endpoints (models list, health, embeddings) are derived from this.
    pub base_url: String,
    /// Species label supplied by the caller at construction.
    /// NOTE(review): how it is consumed is not visible in this part of the file.
    pub species: String,
    /// Snark level supplied by the caller at construction.
    /// NOTE(review): valid range and consumers are not visible here.
    pub snark: u8,
    /// Semaphore owned by the engine; presumably bounds concurrent
    /// KV-cache-bound requests — confirm against the acquire sites.
    pub kv_semaphore: Semaphore,
    /// The model ID currently loaded in LM Studio (auto-detected on boot).
    pub model: std::sync::RwLock<String>,
    /// Context window length in tokens (auto-detected from LM Studio, default 32768).
    pub context_length: std::sync::atomic::AtomicUsize,
    /// Shared, mutex-guarded session economics tracker.
    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
    /// Optional model ID for worker-level tasks (Swarms / research).
    pub worker_model: Option<String>,
    /// Opt-in Gemma-native request shaping. Off by default.
    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
    /// Global cancellation token for hard-interrupting the inference stream.
    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
}
30
/// Reports whether `model` names a Gemma 4 model.
///
/// The check is ASCII-case-insensitive and accepts both the hyphenated
/// (`gemma-4`) and the compact (`gemma4`) spelling anywhere in the identifier.
pub fn is_gemma4_model_name(model: &str) -> bool {
    let normalized = model.to_ascii_lowercase();
    ["gemma-4", "gemma4"]
        .iter()
        .any(|marker| normalized.contains(marker))
}
35
36fn should_use_gemma_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37    is_gemma4_model_name(model) && engine.gemma_native_formatting_enabled()
38}
39
40// ── OpenAI Tool Definition ────────────────────────────────────────────────────
41
/// One entry of the `tools` array sent with a chat request.
///
/// Only `type` and `function` are serialized; `metadata` is local
/// bookkeeping and never reaches the wire.
#[derive(Serialize, Clone, Debug)]
pub struct ToolDefinition {
    /// Serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub tool_type: String,
    /// Name, description and parameter schema of the callable function.
    pub function: ToolFunction,
    /// Local classification of the tool; skipped by serde in both directions.
    #[serde(skip_serializing, skip_deserializing)]
    pub metadata: ToolMetadata,
}
50
/// The `function` payload of a [`ToolDefinition`].
#[derive(Serialize, Clone, Debug)]
pub struct ToolFunction {
    /// Tool name as exposed to the model.
    pub name: String,
    /// Human-readable description of the tool.
    pub description: String,
    /// Arbitrary JSON describing the arguments; presumably a JSON-Schema
    /// object as in the OpenAI tools format — confirm against the builders.
    pub parameters: Value,
}
57
/// Coarse grouping assigned to a tool by `tool_metadata_for_name`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ToolCategory {
    RepoRead,
    RepoWrite,
    Runtime,
    Architecture,
    Toolchain,
    Verification,
    Git,
    Research,
    Vision,
    Lsp,
    Workflow,
    /// Assigned to every tool whose name starts with `mcp__`.
    External,
    /// Fallback for names that match no known tool.
    Other,
}
74
/// Static, name-derived classification of a tool (see
/// `tool_metadata_for_name` for the values assigned to each tool).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ToolMetadata {
    /// Coarse group the tool belongs to.
    pub category: ToolCategory,
    /// Set for tools classified as changing the workspace.
    pub mutates_workspace: bool,
    /// Set for `mcp__`-prefixed (external) tools only.
    pub external_surface: bool,
    /// Set for mutating built-ins and for every `mcp__` tool.
    /// NOTE(review): the approval flow that consumes this is not visible here.
    pub trust_sensitive: bool,
    /// In the current table this is always the inverse of `mutates_workspace`.
    pub read_only_friendly: bool,
    /// Set for repo read/write tools and the pin/clarify helpers.
    /// NOTE(review): presumably gates availability in plan mode — confirm.
    pub plan_scope: bool,
}
84
85pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
86    if name.starts_with("mcp__") {
87        let lower = name.to_ascii_lowercase();
88        let mutates_workspace = [
89            "__edit",
90            "__write",
91            "__create",
92            "__move",
93            "__delete",
94            "__remove",
95            "__rename",
96            "__replace",
97            "__patch",
98        ]
99        .iter()
100        .any(|needle| lower.contains(needle));
101        return ToolMetadata {
102            category: ToolCategory::External,
103            mutates_workspace,
104            external_surface: true,
105            trust_sensitive: true,
106            read_only_friendly: !mutates_workspace,
107            plan_scope: false,
108        };
109    }
110
111    match name {
112        "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
113            category: ToolCategory::RepoRead,
114            mutates_workspace: false,
115            external_surface: false,
116            trust_sensitive: false,
117            read_only_friendly: true,
118            plan_scope: true,
119        },
120        "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => ToolMetadata {
121            category: ToolCategory::RepoWrite,
122            mutates_workspace: true,
123            external_surface: false,
124            trust_sensitive: true,
125            read_only_friendly: false,
126            plan_scope: true,
127        },
128        "trace_runtime_flow" => ToolMetadata {
129            category: ToolCategory::Architecture,
130            mutates_workspace: false,
131            external_surface: false,
132            trust_sensitive: false,
133            read_only_friendly: true,
134            plan_scope: false,
135        },
136        "describe_toolchain" => ToolMetadata {
137            category: ToolCategory::Toolchain,
138            mutates_workspace: false,
139            external_surface: false,
140            trust_sensitive: false,
141            read_only_friendly: true,
142            plan_scope: false,
143        },
144        "shell" => ToolMetadata {
145            category: ToolCategory::Runtime,
146            mutates_workspace: true,
147            external_surface: false,
148            trust_sensitive: true,
149            read_only_friendly: false,
150            plan_scope: false,
151        },
152        "inspect_host" => ToolMetadata {
153            category: ToolCategory::Runtime,
154            mutates_workspace: false,
155            external_surface: false,
156            trust_sensitive: false,
157            read_only_friendly: true,
158            plan_scope: false,
159        },
160        "run_hematite_maintainer_workflow" => ToolMetadata {
161            category: ToolCategory::Workflow,
162            mutates_workspace: true,
163            external_surface: false,
164            trust_sensitive: true,
165            read_only_friendly: false,
166            plan_scope: false,
167        },
168        "run_workspace_workflow" => ToolMetadata {
169            category: ToolCategory::Workflow,
170            mutates_workspace: true,
171            external_surface: false,
172            trust_sensitive: true,
173            read_only_friendly: false,
174            plan_scope: false,
175        },
176        "verify_build" => ToolMetadata {
177            category: ToolCategory::Verification,
178            mutates_workspace: false,
179            external_surface: false,
180            trust_sensitive: false,
181            read_only_friendly: true,
182            plan_scope: false,
183        },
184        "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
185            ToolMetadata {
186                category: ToolCategory::Git,
187                mutates_workspace: true,
188                external_surface: false,
189                trust_sensitive: true,
190                read_only_friendly: false,
191                plan_scope: false,
192            }
193        }
194        "research_web" | "fetch_docs" => ToolMetadata {
195            category: ToolCategory::Research,
196            mutates_workspace: false,
197            external_surface: false,
198            trust_sensitive: false,
199            read_only_friendly: true,
200            plan_scope: false,
201        },
202        "vision_analyze" => ToolMetadata {
203            category: ToolCategory::Vision,
204            mutates_workspace: false,
205            external_surface: false,
206            trust_sensitive: false,
207            read_only_friendly: true,
208            plan_scope: false,
209        },
210        "lsp_definitions"
211        | "lsp_references"
212        | "lsp_hover"
213        | "lsp_rename_symbol"
214        | "lsp_get_diagnostics"
215        | "lsp_search_symbol" => ToolMetadata {
216            category: ToolCategory::Lsp,
217            mutates_workspace: false,
218            external_surface: false,
219            trust_sensitive: false,
220            read_only_friendly: true,
221            plan_scope: false,
222        },
223        "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
224            category: ToolCategory::Workflow,
225            mutates_workspace: false,
226            external_surface: false,
227            trust_sensitive: false,
228            read_only_friendly: true,
229            plan_scope: true,
230        },
231        "manage_tasks" => ToolMetadata {
232            category: ToolCategory::Workflow,
233            mutates_workspace: false,
234            external_surface: false,
235            trust_sensitive: false,
236            read_only_friendly: true,
237            plan_scope: false,
238        },
239        _ => ToolMetadata {
240            category: ToolCategory::Other,
241            mutates_workspace: false,
242            external_surface: false,
243            trust_sensitive: false,
244            read_only_friendly: true,
245            plan_scope: false,
246        },
247    }
248}
249
250// ── Message types ─────────────────────────────────────────────────────────────
251
/// OpenAI-compatible chat message. Content can be a plain string (legacy) or
/// a list of [`ContentPart`]s (multimodal).
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ChatMessage {
    /// Speaker role; the constructors in this file use `system`, `user`,
    /// `assistant` and `tool`.
    pub role: String,
    /// Supports both simple string content and multi-part content (vision).
    pub content: MessageContent,
    /// Tool calls attached to an assistant message. A missing field
    /// deserializes to an empty vec, and an empty vec is omitted on output.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tool_calls: Vec<ToolCallResponse>,
    /// For tool messages: id of the call being answered. Omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
    /// For tool messages: name of the tool that produced the result.
    /// Omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
}
269
/// Message body. Untagged: serializes as either a bare JSON string or an
/// array of content parts, with no wrapper object.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(untagged)]
pub enum MessageContent {
    /// Plain string content.
    Text(String),
    /// Ordered multi-part content (text and image parts).
    Parts(Vec<ContentPart>),
}
276
/// One element of multi-part message content, discriminated by the JSON
/// `type` field (`"text"` or `"image_url"`).
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(tag = "type")]
pub enum ContentPart {
    /// Text fragment: `{"type": "text", "text": ...}`.
    #[serde(rename = "text")]
    Text { text: String },
    /// Image reference: `{"type": "image_url", "image_url": {"url": ...}}`.
    #[serde(rename = "image_url")]
    ImageUrl { image_url: ImageUrlSource },
}
285
/// Payload of an `image_url` content part.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ImageUrlSource {
    /// Image location as a string. NOTE(review): whether callers pass remote
    /// URLs or `data:` URIs is not visible here.
    pub url: String,
}
290
291impl Default for MessageContent {
292    fn default() -> Self {
293        MessageContent::Text(String::new())
294    }
295}
296
297impl MessageContent {
298    pub fn as_str(&self) -> &str {
299        match self {
300            MessageContent::Text(s) => s,
301            MessageContent::Parts(parts) => {
302                for part in parts {
303                    if let ContentPart::Text { text } = part {
304                        return text;
305                    }
306                }
307                ""
308            }
309        }
310    }
311}
312
313impl ChatMessage {
314    pub fn system(content: &str) -> Self {
315        Self {
316            role: "system".into(),
317            content: MessageContent::Text(content.into()),
318            tool_calls: Vec::new(),
319            tool_call_id: None,
320            name: None,
321        }
322    }
323    pub fn user(content: &str) -> Self {
324        Self {
325            role: "user".into(),
326            content: MessageContent::Text(content.into()),
327            tool_calls: Vec::new(),
328            tool_call_id: None,
329            name: None,
330        }
331    }
332    pub fn user_with_image(text: &str, image_url: &str) -> Self {
333        let mut text_parts = text.to_string();
334        if !text_parts.contains("<|image|>") {
335            text_parts.push_str(" <|image|>");
336        }
337        Self {
338            role: "user".into(),
339            content: MessageContent::Parts(vec![
340                ContentPart::Text { text: text_parts },
341                ContentPart::ImageUrl {
342                    image_url: ImageUrlSource {
343                        url: image_url.into(),
344                    },
345                },
346            ]),
347            tool_calls: Vec::new(),
348            tool_call_id: None,
349            name: None,
350        }
351    }
352    pub fn assistant_text(content: &str) -> Self {
353        Self {
354            role: "assistant".into(),
355            content: MessageContent::Text(content.into()),
356            tool_calls: Vec::new(),
357            tool_call_id: None,
358            name: None,
359        }
360    }
361    pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
362        Self {
363            role: "assistant".into(),
364            content: MessageContent::Text(content.into()),
365            tool_calls: calls,
366            tool_call_id: None,
367            name: None,
368        }
369    }
370    pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
371        Self::tool_result_for_model(tool_call_id, fn_name, content, "")
372    }
373
374    /// Build a tool result message, applying Gemma 4 native markup only when the
375    /// loaded model is actually a Gemma 4 model.
376    pub fn tool_result_for_model(
377        tool_call_id: &str,
378        fn_name: &str,
379        content: &str,
380        model: &str,
381    ) -> Self {
382        let body = if is_gemma4_model_name(model) {
383            format!(
384                "<|tool_response>response:{}{}{}<tool_response|>",
385                fn_name, "{", content
386            )
387        } else {
388            content.to_string()
389        };
390        Self {
391            role: "tool".into(),
392            content: MessageContent::Text(body),
393            tool_calls: Vec::new(),
394            tool_call_id: Some(tool_call_id.into()),
395            name: Some(fn_name.into()),
396        }
397    }
398}
399
400// ── Tool call as returned by the model ───────────────────────────────────────
401
/// A tool call as returned by the model (and echoed back in assistant
/// messages via `ChatMessage::tool_calls`).
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallResponse {
    /// Call identifier; tool result messages refer back to it.
    pub id: String,
    /// Serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub call_type: String,
    /// Function name and encoded arguments of the call.
    pub function: ToolCallFn,
}
409
/// The function portion of a [`ToolCallResponse`].
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallFn {
    /// Name of the tool the model wants to invoke.
    pub name: String,
    /// JSON-encoded arguments string (as returned by the API).
    pub arguments: String,
}
416
417// ── HTTP request / response shapes ───────────────────────────────────────────
418
/// Request body for the chat-completions endpoint.
#[derive(Serialize)]
struct ChatRequest {
    /// Model id to run the completion with.
    model: String,
    /// Conversation history, in order.
    messages: Vec<ChatMessage>,
    /// Sampling temperature.
    temperature: f32,
    /// Whether a streamed response is requested.
    stream: bool,
    /// Tool definitions offered to the model; the key is omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    tools: Option<Vec<ToolDefinition>>,
}
428
/// Response body of a chat completion.
#[derive(Deserialize, Debug)]
struct ChatResponse {
    /// Completion candidates returned by the provider.
    choices: Vec<ResponseChoice>,
    /// Token accounting, when the provider reports it.
    usage: Option<TokenUsage>,
}
434
/// Token accounting reported by the provider for one completion.
#[derive(Deserialize, Debug, Clone)]
pub struct TokenUsage {
    /// Tokens consumed by the prompt.
    pub prompt_tokens: usize,
    /// Tokens generated in the completion.
    pub completion_tokens: usize,
    /// Total as reported by the provider.
    pub total_tokens: usize,
    /// Defaults to 0 when absent. NOTE(review): presumably a
    /// provider-specific prompt-cache hit counter — confirm which providers send it.
    #[serde(default)]
    pub prompt_cache_hit_tokens: usize,
    /// Defaults to 0 when absent. NOTE(review): presumably an alternative
    /// cache-read counter used by other providers — confirm.
    #[serde(default)]
    pub cache_read_input_tokens: usize,
}
445
/// One completion candidate inside a [`ChatResponse`].
#[derive(Deserialize, Debug)]
struct ResponseChoice {
    /// The assistant message produced for this choice.
    message: ResponseMessage,
    /// Provider-reported stop reason; `None` when missing.
    #[serde(default)]
    finish_reason: Option<String>,
}
452
/// Assistant message as returned by the provider.
#[derive(Deserialize, Debug)]
struct ResponseMessage {
    /// Visible text of the reply, if any.
    content: Option<String>,
    /// Structured tool calls requested by the model, if any.
    tool_calls: Option<Vec<ToolCallResponse>>,
    /// LM Studio routes Qwen3 thinking-mode output here instead of wrapping
    /// it in <think> tags inside `content`. When tool calls are generated
    /// inside a think block, they end up here rather than in `tool_calls`.
    #[serde(default)]
    reasoning_content: Option<String>,
}
463
// NOTE(review): the use sites of these two constants are outside this part of
// the file; by name they are the lower and upper bounds on the number of
// tokens reserved for model output when budgeting a prompt — confirm.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1024;
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4096;
466
/// Returns `true` for context windows of at most 8,192 tokens.
fn is_tiny_context_window(context_length: usize) -> bool {
    const TINY_WINDOW_MAX: usize = 8_192;
    context_length <= TINY_WINDOW_MAX
}
470
/// Returns `true` for context windows larger than 8,192 tokens and no larger
/// than 49,152 tokens.
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
474
475pub fn is_compact_context_window_pub(context_length: usize) -> bool {
476    is_compact_context_window(context_length)
477}
478
/// Recognizes provider error text that reports a context-window overflow.
///
/// `lower` must already be lowercased by the caller. Matches either the
/// `n_keep` / `n_ctx` pair appearing together, or any one of the known
/// context-limit phrases.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const LIMIT_PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    let names_keep_and_ctx = lower.contains("n_keep") && lower.contains("n_ctx");
    names_keep_and_ctx || LIMIT_PHRASES.iter().any(|phrase| lower.contains(phrase))
}
486
487fn classify_runtime_failure_tag(detail: &str) -> &'static str {
488    let lower = detail.to_ascii_lowercase();
489    if lower.contains("context_window_blocked")
490        || lower.contains("context ceiling reached")
491        || lower.contains("exceeds the")
492        || is_provider_context_limit_detail(&lower)
493    {
494        "context_window"
495    } else if lower.contains("empty response from model")
496        || lower.contains("model returned an empty response")
497    {
498        "empty_model_response"
499    } else if lower.contains("action blocked:")
500        || lower.contains("access denied")
501        || lower.contains("declined by user")
502    {
503        "tool_policy_blocked"
504    } else {
505        "provider_degraded"
506    }
507}
508
/// Returns the operator guidance sentence for a failure tag.
///
/// Unknown tags (including `provider_degraded`) get the generic retry advice.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const CONTEXT_WINDOW: &str = "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    const EMPTY_REPLY: &str = "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    const POLICY_BLOCKED: &str = "Stay inside the allowed workflow or switch modes before retrying.";
    const GENERIC: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";

    if tag == "context_window" {
        CONTEXT_WINDOW
    } else if tag == "empty_model_response" {
        EMPTY_REPLY
    } else if tag == "tool_policy_blocked" {
        POLICY_BLOCKED
    } else {
        GENERIC
    }
}
523
524fn format_runtime_failure_message(detail: &str) -> String {
525    let tag = classify_runtime_failure_tag(detail);
526    format!(
527        "[failure:{}] {} Detail: {}",
528        tag,
529        runtime_failure_guidance(tag),
530        detail.trim()
531    )
532}
533
/// Coarse health state of the LLM provider, surfaced to the operator via
/// `InferenceEvent::ProviderStatus`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProviderRuntimeState {
    Booting,
    Live,
    Recovering,
    /// Used for any failure tag other than the two specific ones below.
    Degraded,
    /// The failure was classified as `context_window`.
    ContextWindow,
    /// The failure was classified as `empty_model_response`.
    EmptyResponse,
}
543
/// Coarse health state of the MCP server layer, surfaced to the operator via
/// `InferenceEvent::McpStatus`.
/// NOTE(review): the conditions that select each variant are not visible in
/// this part of the file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum McpRuntimeState {
    Unconfigured,
    Healthy,
    Degraded,
    Failed,
}
551
/// Typed checkpoint/blocker state carried by
/// `InferenceEvent::OperatorCheckpoint`. Each variant has a stable
/// snake_case label available through `label()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OperatorCheckpointState {
    Idle,
    RecoveringProvider,
    BudgetReduced,
    HistoryCompacted,
    BlockedContextWindow,
    BlockedPolicy,
    BlockedRecentFileEvidence,
    BlockedExactLineWindow,
    BlockedToolLoop,
    BlockedVerification,
}
565
566impl OperatorCheckpointState {
567    pub fn label(self) -> &'static str {
568        match self {
569            OperatorCheckpointState::Idle => "idle",
570            OperatorCheckpointState::RecoveringProvider => "recovering_provider",
571            OperatorCheckpointState::BudgetReduced => "budget_reduced",
572            OperatorCheckpointState::HistoryCompacted => "history_compacted",
573            OperatorCheckpointState::BlockedContextWindow => "blocked_context_window",
574            OperatorCheckpointState::BlockedPolicy => "blocked_policy",
575            OperatorCheckpointState::BlockedRecentFileEvidence => "blocked_recent_file_evidence",
576            OperatorCheckpointState::BlockedExactLineWindow => "blocked_exact_line_window",
577            OperatorCheckpointState::BlockedToolLoop => "blocked_tool_loop",
578            OperatorCheckpointState::BlockedVerification => "blocked_verification",
579        }
580    }
581}
582
583fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
584    match tag {
585        "context_window" => ProviderRuntimeState::ContextWindow,
586        "empty_model_response" => ProviderRuntimeState::EmptyResponse,
587        _ => ProviderRuntimeState::Degraded,
588    }
589}
590
/// Produces a one-line, operator-facing summary for a classified failure.
///
/// The three specific tags (`context_window`, `empty_model_response`,
/// `tool_policy_blocked`) map to fixed sentences. Any other tag yields
/// `LM Studio degraded: <excerpt>`, where the excerpt is the first 12
/// whitespace-separated words of `detail`, cut to at most 110 bytes and
/// suffixed with `...` when it was cut. An empty excerpt falls back to a
/// fixed "degraded" sentence.
///
/// The cut never splits a UTF-8 character: if byte 110 falls inside a
/// multi-byte character, the excerpt is shortened to the previous boundary.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    const MAX_EXCERPT_WORDS: usize = 12;
    const MAX_EXCERPT_BYTES: usize = 110;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(MAX_EXCERPT_WORDS)
                .collect::<Vec<_>>()
                .join(" ");
            if excerpt.len() > MAX_EXCERPT_BYTES {
                // `String::truncate` panics off a char boundary, so back off
                // to the nearest one. Index 0 is always a boundary.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }
            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
624
625// ── Events pushed to the TUI ──────────────────────────────────────────────────
626
/// Events emitted by the inference layer for the TUI / operator surface.
#[derive(Debug)]
pub enum InferenceEvent {
    /// A text token to append to the current assistant message.
    Token(String),
    /// A text token to be displayed on screen but NOT spoken (e.g. startup greeting).
    MutedToken(String),
    /// Internal model reasoning (shown in side panel, not dialogue).
    Thought(String),
    /// Critical diagnostic feedback from the voice synthesis engine.
    VoiceStatus(String),
    /// A tool call is starting - show a status line in the TUI.
    ToolCallStart {
        /// Identifier of the tool call.
        id: String,
        /// Tool name.
        name: String,
        /// Call arguments as a string.
        args: String,
    },
    /// A tool call completed - show result in the TUI.
    ToolCallResult {
        /// Identifier of the tool call.
        id: String,
        /// Tool name.
        name: String,
        /// Output text produced by the tool.
        output: String,
        /// Whether the tool reported an error.
        is_error: bool,
    },
    /// A risky tool requires explicit user approval.
    /// The TUI must send `true` (approved) or `false` (rejected) via `responder`.
    /// When `diff` is Some, the modal renders a coloured before/after diff preview.
    ApprovalRequired {
        /// Identifier of the tool call awaiting approval.
        id: String,
        /// Tool name.
        name: String,
        /// Text shown to the user in the approval prompt.
        display: String,
        /// Pre-formatted diff: lines starting with "- " are removals, "+ " are additions,
        /// "---" is a file header.  None means a plain high-risk approval (no diff).
        diff: Option<String>,
        /// One-shot channel on which the TUI reports the user's decision.
        responder: tokio::sync::oneshot::Sender<bool>,
    },
    /// The current agent turn is complete.
    Done,
    /// An error occurred during inference.
    Error(String),
    /// Compact provider/runtime state for the operator surface.
    ProviderStatus {
        state: ProviderRuntimeState,
        summary: String,
    },
    /// Typed operator checkpoint/blocker state for SPECULAR and recovery UIs.
    OperatorCheckpoint {
        state: OperatorCheckpointState,
        summary: String,
    },
    /// Typed recovery recipe summary for operator/debug surfaces.
    RecoveryRecipe { summary: String },
    /// Compact MCP/runtime server health for the operator surface.
    McpStatus {
        state: McpRuntimeState,
        summary: String,
    },
    /// Current compaction pressure against the adaptive threshold.
    CompactionPressure {
        estimated_tokens: usize,
        threshold_tokens: usize,
        /// NOTE(review): presumably estimated/threshold as a percentage — confirm at the emit site.
        percent: u8,
    },
    /// Current total prompt-budget pressure against the live context window.
    PromptPressure {
        estimated_input_tokens: usize,
        reserved_output_tokens: usize,
        estimated_total_tokens: usize,
        context_length: usize,
        /// NOTE(review): presumably total/context_length as a percentage — confirm at the emit site.
        percent: u8,
    },
    /// A generic task progress update (e.g. for single-agent tool execution).
    TaskProgress {
        id: String,
        label: String,
        /// NOTE(review): presumably a 0-100 percentage — confirm at the emit site.
        progress: u8,
    },
    /// Real-time token usage update from the API.
    UsageUpdate(TokenUsage),
    /// The current runtime profile detected from LM Studio.
    RuntimeProfile {
        model_id: String,
        context_length: usize,
    },
    /// Vein index status after each incremental re-index.
    VeinStatus {
        file_count: usize,
        embedded_count: usize,
        docs_only: bool,
    },
    /// File paths the Vein surfaced as relevant to the current turn.
    /// Used to populate ACTIVE CONTEXT with retrieval results.
    VeinContext { paths: Vec<String> },
    /// A new companion was hatched mid-session via /reroll.
    SoulReroll {
        species: String,
        rarity: String,
        shiny: bool,
        personality: String,
    },
    /// Embed model loaded/unloaded mid-session.
    EmbedProfile { model_id: Option<String> },
}
729
730// ── Engine implementation ─────────────────────────────────────────────────────
731
732impl InferenceEngine {
733    pub fn new(
734        api_url: String,
735        species: String,
736        snark: u8,
737    ) -> Result<Self, Box<dyn std::error::Error>> {
738        let client = reqwest::Client::builder()
739            .timeout(std::time::Duration::from_secs(180))
740            .build()?;
741
742        // Extract http://host:port as the base for all non-completions endpoints.
743        let base_url = {
744            let trimmed = api_url.trim_end_matches('/');
745            if let Some(scheme_end) = trimmed.find("://") {
746                let after_scheme = &trimmed[scheme_end + 3..];
747                if let Some(path_start) = after_scheme.find('/') {
748                    format!(
749                        "{}://{}",
750                        &trimmed[..scheme_end],
751                        &after_scheme[..path_start]
752                    )
753                } else {
754                    trimmed.to_string()
755                }
756            } else {
757                trimmed.to_string()
758            }
759        };
760
761        let api_url = if api_url.ends_with("/chat/completions") {
762            api_url
763        } else if api_url.ends_with("/") {
764            format!("{}chat/completions", api_url)
765        } else {
766            format!("{}/chat/completions", api_url)
767        };
768
769        Ok(Self {
770            client,
771            api_url,
772            base_url,
773            species,
774            snark,
775            kv_semaphore: Semaphore::new(3),
776            model: std::sync::RwLock::new(String::new()),
777            context_length: std::sync::atomic::AtomicUsize::new(32_768), // Gemma-4 Sweet Spot (32K)
778            economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
779            worker_model: None,
780            gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
781            cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
782        })
783    }
784
785    pub fn set_gemma_native_formatting(&self, enabled: bool) {
786        self.gemma_native_formatting
787            .store(enabled, std::sync::atomic::Ordering::SeqCst);
788    }
789
790    pub fn gemma_native_formatting_enabled(&self) -> bool {
791        self.gemma_native_formatting
792            .load(std::sync::atomic::Ordering::SeqCst)
793    }
794
795    pub fn current_model(&self) -> String {
796        self.model.read().map(|g| g.clone()).unwrap_or_default()
797    }
798
799    pub fn current_context_length(&self) -> usize {
800        self.context_length
801            .load(std::sync::atomic::Ordering::SeqCst)
802    }
803
804    pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
805        if let Ok(mut guard) = self.model.write() {
806            *guard = model.to_string();
807        }
808        self.context_length
809            .store(context_length, std::sync::atomic::Ordering::SeqCst);
810    }
811
812    /// Returns true if LM Studio is reachable.
813    pub async fn health_check(&self) -> bool {
814        let url = format!("{}/v1/models", self.base_url);
815        match self.client.get(&url).send().await {
816            Ok(resp) => resp.status().is_success(),
817            Err(_) => false,
818        }
819    }
820
821    /// Query /api/v0/models and return the first loaded chat model id.
822    /// Uses /api/v0/models (not /v1/models) because the OpenAI-compat endpoint
823    /// omits the `type` field, making it impossible to distinguish embedding
824    /// models from chat models. Falls back to /v1/models with a name heuristic
825    /// if /api/v0/models is unavailable.
826    /// Returns Some("") when LM Studio is reachable but no chat model is loaded
827    /// so callers can distinguish "offline" (None) from "no chat model" (Some("")).
828    pub async fn get_loaded_model(&self) -> Option<String> {
829        #[derive(Deserialize)]
830        struct ModelList {
831            data: Vec<ModelEntry>,
832        }
833        #[derive(Deserialize)]
834        struct ModelEntry {
835            id: String,
836            #[serde(rename = "type", default)]
837            model_type: String,
838            #[serde(default)]
839            state: String,
840        }
841
842        // Try /api/v0/models first — it has type and state fields.
843        if let Ok(resp) = self
844            .client
845            .get(format!("{}/api/v0/models", self.base_url))
846            .send()
847            .await
848        {
849            if let Ok(list) = resp.json::<ModelList>().await {
850                let chat_model = list
851                    .data
852                    .into_iter()
853                    .find(|m| m.model_type != "embeddings" && m.state == "loaded")
854                    .map(|m| m.id)
855                    .unwrap_or_default();
856                return Some(chat_model);
857            }
858        }
859
860        // Fallback: /v1/models lacks type info — use name heuristic to skip embed models.
861        let resp = self
862            .client
863            .get(format!("{}/v1/models", self.base_url))
864            .send()
865            .await
866            .ok()?;
867        let list: ModelList = resp.json().await.ok()?;
868        Some(
869            list.data
870                .into_iter()
871                .find(|m| !m.id.to_lowercase().contains("embed"))
872                .map(|m| m.id)
873                .unwrap_or_default(),
874        )
875    }
876
877    /// Returns the ID of the first loaded embedding model, if any.
878    /// Uses /api/v0/models which includes `type` and `state` fields.
879    /// The OpenAI-compat /v1/models endpoint omits `type` so cannot be used here.
880    /// Accepts any non-empty state (not just "loaded") to handle LM Studio variants
881    /// where the embed model may report a different state string at startup.
882    pub async fn get_embedding_model(&self) -> Option<String> {
883        #[derive(Deserialize)]
884        struct ModelList {
885            data: Vec<ModelEntry>,
886        }
887        #[derive(Deserialize)]
888        struct ModelEntry {
889            id: String,
890            #[serde(rename = "type", default)]
891            model_type: String,
892            #[serde(default)]
893            state: String,
894        }
895        let resp = self
896            .client
897            .get(format!("{}/api/v0/models", self.base_url))
898            .send()
899            .await
900            .ok()?;
901        let list: ModelList = resp.json().await.ok()?;
902        list.data
903            .into_iter()
904            .find(|m| m.model_type == "embeddings" && m.state == "loaded")
905            .map(|m| m.id)
906    }
907
908    /// Detect the loaded model's context window size.
909    /// Tries LM Studio's `/api/v0/models` endpoint first and prefers the loaded
910    /// model's live `loaded_context_length`, then falls back to older
911    /// `context_length` / `max_context_length` style fields.
912    /// Falls back to a heuristic from the model name, then 32K.
913    pub async fn detect_context_length(&self) -> usize {
914        #[derive(Deserialize)]
915        struct LmStudioModel {
916            id: Option<String>,
917            #[serde(rename = "type", default)]
918            model_type: String,
919            state: Option<String>,
920            loaded_context_length: Option<u64>,
921            context_length: Option<u64>,
922            max_context_length: Option<u64>,
923        }
924        #[derive(Deserialize)]
925        struct LmStudioList {
926            data: Vec<LmStudioModel>,
927        }
928
929        // Check api/v0/models (LM Studio specific)
930        if let Ok(resp) = self
931            .client
932            .get(format!("{}/api/v0/models", self.base_url))
933            .send()
934            .await
935        {
936            if let Ok(list) = resp.json::<LmStudioList>().await {
937                let target_model = self.current_model().to_ascii_lowercase();
938                // Never select embedding models for context-length detection.
939                let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
940                let loaded = list
941                    .data
942                    .iter()
943                    .find(|m| {
944                        non_embed(m)
945                            && m.state.as_deref() == Some("loaded")
946                            && m.id
947                                .as_deref()
948                                .map(|id| id.eq_ignore_ascii_case(&target_model))
949                                .unwrap_or(false)
950                    })
951                    .or_else(|| {
952                        list.data
953                            .iter()
954                            .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
955                    })
956                    .or_else(|| {
957                        list.data.iter().find(|m| {
958                            non_embed(m)
959                                && m.id
960                                    .as_deref()
961                                    .map(|id| id.eq_ignore_ascii_case(&target_model))
962                                    .unwrap_or(false)
963                        })
964                    })
965                    .or_else(|| list.data.iter().find(|m| non_embed(m)));
966
967                if let Some(model) = loaded {
968                    if let Some(ctx) = model.loaded_context_length {
969                        if ctx > 0 {
970                            return ctx as usize;
971                        }
972                    }
973                    if let Some(ctx) = model.context_length {
974                        if ctx > 0 {
975                            return ctx as usize;
976                        }
977                    }
978                    if let Some(ctx) = model.max_context_length {
979                        if ctx > 0 && ctx <= 32_768 {
980                            return ctx as usize;
981                        }
982                    }
983                }
984            }
985        }
986
987        // Heuristic fallback:
988        // If "gemma-4" is detected, we target 32,768 as the baseline standard,
989        // acknowledging that 131,072 is available for High-Capacity tasks.
990        if self.current_model().to_lowercase().contains("gemma-4") {
991            return 32_768;
992        }
993
994        32_768
995    }
996
997    pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
998        let previous_model = self.current_model();
999        let previous_context = self.current_context_length();
1000
1001        let detected_model = match self.get_loaded_model().await {
1002            Some(m) if !m.is_empty() => m,            // coding model found
1003            Some(_) => "no model loaded".to_string(), // reachable but no coding model
1004            None => previous_model.clone(),           // LM Studio offline
1005        };
1006
1007        if !detected_model.is_empty() && detected_model != previous_model {
1008            if let Ok(mut guard) = self.model.write() {
1009                *guard = detected_model.clone();
1010            }
1011        }
1012
1013        let detected_context = self.detect_context_length().await;
1014        let effective_model = if detected_model.is_empty() {
1015            previous_model.clone()
1016        } else {
1017            detected_model
1018        };
1019
1020        let changed = effective_model != previous_model || detected_context != previous_context;
1021        self.set_runtime_profile(&effective_model, detected_context);
1022
1023        Some((effective_model, detected_context, changed))
1024    }
1025
1026    pub fn build_system_prompt(
1027        &self,
1028        snark: u8,
1029        chaos: u8,
1030        brief: bool,
1031        professional: bool,
1032        tools: &[ToolDefinition],
1033        reasoning_history: Option<&str>,
1034        mcp_tools: &[crate::agent::mcp::McpTool],
1035    ) -> String {
1036        let mut sys = self.build_system_prompt_legacy(
1037            snark,
1038            chaos,
1039            brief,
1040            professional,
1041            tools,
1042            reasoning_history,
1043        );
1044
1045        if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1046            sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1047            sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1048            for tool in mcp_tools {
1049                let description = tool
1050                    .description
1051                    .as_deref()
1052                    .unwrap_or("No description provided.");
1053                sys.push_str(&format!("- {}: {}\n", tool.name, description));
1054            }
1055        }
1056
1057        sys
1058    }
1059
1060    pub fn build_system_prompt_legacy(
1061        &self,
1062        snark: u8,
1063        _chaos: u8,
1064        brief: bool,
1065        professional: bool,
1066        tools: &[ToolDefinition],
1067        reasoning_history: Option<&str>,
1068    ) -> String {
1069        let current_context_length = self.current_context_length();
1070        if is_tiny_context_window(current_context_length) {
1071            return self.build_system_prompt_tiny(brief, professional);
1072        }
1073        if is_compact_context_window(current_context_length) {
1074            return self.build_system_prompt_compact(brief, professional, tools);
1075        }
1076
1077        // Hematite bootstrap: keep reasoning disciplined without leaking scaffolding into user-facing replies.
1078        let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
1079                                     - You are Hematite, a local coding system working on the user's machine.\n\
1080                                     - The running Hematite build is ");
1081        sys.push_str(&crate::hematite_version_display());
1082        sys.push_str(".\n\
1083                                     - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
1084                                     - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
1085                                     - For simple questions, answer briefly in plain language.\n\
1086                                     - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
1087                                     - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
1088                                     - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
1089                                     - Keep internal reasoning inside channel delimiters.\n\
1090                                     - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
1091                                     <turn|>\n\n");
1092
1093        if let Some(history) = reasoning_history {
1094            if !history.is_empty() {
1095                sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
1096                sys.push_str(history);
1097                sys.push_str("\n\n");
1098            }
1099        }
1100
1101        // ADAPTIVE THOUGHT EFFICIENCY (Gemma-4 Native)
1102        if brief {
1103            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
1104                          - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
1105                          - Depth: Surface-level verification only.\n\n");
1106        } else {
1107            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
1108                          - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
1109                          - Depth: Full multi-step derivation required.\n\n");
1110        }
1111
1112        // IDENTITY & ENVIRONMENT
1113        let os = std::env::consts::OS;
1114        if professional {
1115            sys.push_str(&format!(
1116                "You are Hematite, a local coding system running on {}. \
1117                 The TUI is one interface layer, not your whole identity. \
1118                 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
1119                 Skip filler and keep the focus on the work.\n",
1120                os
1121            ));
1122        } else {
1123            sys.push_str(&format!(
1124                "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
1125                 The terminal UI is only one surface of the system. \
1126                 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
1127                 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
1128                self.species, snark, os
1129            ));
1130        }
1131
1132        // Inject loaded model and context window so the model knows its own budget.
1133        let current_model = self.current_model();
1134        if !current_model.is_empty() {
1135            sys.push_str(&format!(
1136                "Loaded model: {} | Context window: {} tokens. \
1137                 Calibrate response length and tool-call depth to fit within this budget.\n\n",
1138                current_model, current_context_length
1139            ));
1140            if is_gemma4_model_name(&current_model) {
1141                sys.push_str(
1142                    "Gemma 4 native note: prefer exact tool JSON with no extra prose when calling tools. \
1143                     Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
1144                     For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
1145                );
1146            }
1147        } else {
1148            sys.push_str(&format!(
1149                "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
1150                current_context_length
1151            ));
1152        }
1153
1154        // PROTOCOL & TOOLS
1155        let shell_desc = if cfg!(target_os = "windows") {
1156            "[EXTERNAL SHELL]: `powershell` (Windows).\n\
1157             - Use ONLY for builds, tests, or file migrations. \n\
1158             - You MUST use the `powershell` tool directly. \n\
1159             - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
1160        } else {
1161            "[EXTERNAL SHELL]: `bash` (Unix).\n\
1162             - Use ONLY for builds, tests, or file migrations. \n\
1163             - NEVER wrap bash in other shells. \n\n"
1164        };
1165
1166        sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
1167                      [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
1168                      - These are the ONLY way to explore and modify code. \n\
1169                      - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
1170        sys.push_str(shell_desc);
1171
1172        // ANTI-LOOPING & SELF-AUDIT
1173        sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
1174                      SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
1175
1176        if brief {
1177            sys.push_str(
1178                "BRIEF MODE: Respond in exactly ONE concise sentence unless providing code.\n\n",
1179            );
1180        }
1181
1182        if cfg!(target_os = "windows") {
1183            sys.push_str("Shell Protocol: You are running on WINDOWS. You MUST NOT use 'bash' or '/dev/null'. \
1184                          You MUST use 'powershell' (pwsh) for all shell tasks. \
1185                          DO NOT attempt to manipulate Linux-style paths like /dev, /etc, or /sys.\n\n");
1186        } else if cfg!(target_os = "macos") {
1187            sys.push_str(
1188                "Shell Protocol: You are running on macOS. Use 'bash' or 'zsh' for shell tasks. \
1189                          Standard Unix paths apply.\n\n",
1190            );
1191        } else {
1192            sys.push_str(
1193                "Shell Protocol: You are running on Linux. Use 'bash' for shell tasks. \
1194                          Standard Unix paths apply.\n\n",
1195            );
1196        }
1197
1198        sys.push_str("OUTPUT RULES:\n\
1199                      1. Your internal reasoning goes in <think>...</think> blocks. Do NOT output reasoning as plain text.\n\
1200                      2. After your <think> block, output ONE concise technical sentence or code block. Nothing else.\n\
1201                      3. Do NOT call tools named 'thought', 'think', 'reasoning', or any meta-cognitive name. These are not tools.\n\
1202                      4. NEGATIVE CONSTRAINT: Never use a string containing a dot (.), slash (/), or backslash (\\) as a tool name. Paths are NOT tools.\n\
1203                      5. NEGATIVE CONSTRAINT: Never use the name of a class, struct, or module as a tool name unless it is explicitly in the tool list.\n\
1204                      6. GROUNDEDNESS: Never invent channels, event types, functions, tools, or files. If a detail is not verified from the repo or tool output, say `uncertain`.\n\
1205                      7. TRACE QUESTIONS: For architecture or control-flow questions, prefer verified file and function names over high-level summaries.\n\
1206                      8. If `trace_runtime_flow` fully answers the runtime question, preserve its identifiers exactly. Do not restyle or rename symbols from that tool output.\n\
1207                      9. For generic capability questions, answer from stable Hematite capabilities. Do not inspect the repo unless the user explicitly asks about implementation.\n\
1208                      10. Never infer language support, project support, or internet capability from unrelated crates or config files.\n\
1209                      11. It is fine to say Hematite itself is written in Rust when relevant, but do not imply that capability is limited to Rust projects.\n\
1210                      12. For language questions, answer at the harness level: file operations, shell, build verification, language-aware tooling when available, and multi-language project work.\n\
1211                      13. Prefer real programming language examples like Python, JavaScript, TypeScript, Go, and C# over file extensions when answering language questions.\n\
1212                      14. For project-building questions, talk about scaffolding, implementation, builds, tests, and iteration across different stacks instead of defaulting to a Rust-only example like `cargo build`.\n\
1213                      15. Never mention raw `mcp__*` tool names unless those tools are active this turn and directly relevant.\n\
1214                      16. For tooling-discipline or best-tool-selection questions, prefer `describe_toolchain` over improvising the tool surface from memory.\n\
1215                      17. If `describe_toolchain` fully answers the tooling question, preserve its tool names and investigation order exactly.\n\
1216                      18. PROOF BEFORE ACTION: Before editing an existing file, gather recent evidence with `read_file` or `inspect_lines` on that path or keep it pinned in active context.\n\
1217                      18a. GREP BEFORE READ: For files over ~200 lines, always `grep_files` for a specific pattern to find the target line range BEFORE calling `read_file`. Never read a large file top-to-bottom — use offset+limit to read only the relevant window once grep gives you the line number.\n\
1218                      19. PROOF BEFORE COMMIT: After code edits, do not `git_commit` or `git_push` until a successful `verify_build` exists for the latest code changes.\n\
1219                      20. RISKY SHELL DISCIPLINE: Risky `shell` calls must include a concrete `reason` argument explaining what is being verified or changed.\n\
1220                      21. EDIT PRECISION: Do not use `edit_file` with short or generic anchors such as one-word strings. Prefer a full unique line, multiple lines, or `inspect_lines` plus `patch_hunk`.\n\
1221                      22. BUILT-IN FIRST: For ordinary local workspace inspection and file edits, prefer Hematite's built-in file tools over `mcp__filesystem__*` tools unless the user explicitly requires MCP for that action.\n\
1222                      22a. HOST INSPECTION PRIORITY: For read-only questions about installed tools, PATH entries, environment/package-manager health, grounded fix plans for common workstation failures, network state, service state, running processes, desktop items, Downloads size, listening ports, repo-health summaries, or directory/disk reports, prefer `inspect_host` over raw `shell` when it can answer directly. If the user asks how to fix a common workstation problem such as `cargo not found`, `port 3000 already in use`, or `LM Studio not reachable`, use `fix_plan` first instead of `env_doctor`, `path`, or `ports`. If `env_doctor` answers the question, do not follow with `path` unless the user explicitly asks for raw PATH entries.\n\
1223                      22b. HEMATITE MAINTAINER WORKFLOW PRIORITY: When the user explicitly asks to run Hematite's own cleanup, packaging, or release scripts, prefer `run_hematite_maintainer_workflow` over raw `shell`. This tool is for Hematite's own maintainer workflows, not for arbitrary scripts in the active workspace.\n\
1224                      22c. WORKSPACE WORKFLOW PRIORITY: When the user asks to run the current project's build, test, lint, fix, package scripts, just/task/make targets, local scripts, or an exact workspace command, prefer `run_workspace_workflow` over raw `shell`. This tool always runs from the locked workspace root. If no real project workspace is locked, say so and tell the user to relaunch Hematite in the target project directory.");
1225
1226        // Scaffolding protocol — enforces build validation after project creation.
1227        sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
1228            2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
1229            3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
1230            4. Fix all errors before declaring success.\n\n\
1231            ## PRE-FLIGHT SCOPING PROTOCOL\n\
1232            Before attempting any multi-file task or complex refactor:\n\
1233            1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
1234            2. Use `auto_pin_context` to keep those files in active context.\n\
1235            3. Only then proceed to deeper edits or research.\n\n\
1236            ## REFACTORING PROTOCOL\n\
1237            When modifying existing code or renaming symbols:\n\
1238            1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
1239            2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
1240            3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
1241
1242        // Inject CLAUDE.md / instruction files from the project directory.
1243        sys.push_str(&load_instruction_files());
1244
1245        // Inject cross-session memories synthesized by DeepReflect.
1246        sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
1247
1248        // Native Gemma-4 Tool Declarations
1249        if !tools.is_empty() {
1250            sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
1251            for tool in tools {
1252                let schema = serde_json::to_string(&tool.function.parameters)
1253                    .unwrap_or_else(|_| "{}".to_string());
1254                sys.push_str(&format!(
1255                    "<|tool>declaration:{}{}{}<tool|>\n",
1256                    tool.function.name, "{", schema
1257                ));
1258                sys.push_str(&format!("// {})\n", tool.function.description));
1259            }
1260        }
1261
1262        sys
1263    }
1264
1265    fn build_system_prompt_compact(
1266        &self,
1267        brief: bool,
1268        professional: bool,
1269        tools: &[ToolDefinition],
1270    ) -> String {
1271        // Compact tier: fits in 16k context. Keeps tool names + one-line descriptions
1272        // but skips full JSON schemas, verbose protocol sections, and CLAUDE.md injection.
1273        let current_model = self.current_model();
1274        let current_context_length = self.current_context_length();
1275        let os = std::env::consts::OS;
1276
1277        let mut sys = String::from("<|turn>system\n<|think|>\n");
1278        sys.push_str(&format!(
1279            "You are Hematite {}, a local coding harness working on the user's machine.\n",
1280            crate::hematite_version_display()
1281        ));
1282        if professional {
1283            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1284        } else {
1285            sys.push_str(&format!(
1286                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1287                self.species
1288            ));
1289        }
1290        sys.push_str(&format!(
1291            "Model: {} | Context: {} tokens. Keep turns focused.\n",
1292            current_model, current_context_length
1293        ));
1294        if is_gemma4_model_name(&current_model) {
1295            sys.push_str(
1296                "Gemma 4: use exact tool JSON. No extra prose in tool calls. \
1297                 Raw regex patterns in grep_files, no slash delimiters.\n",
1298            );
1299        }
1300        if cfg!(target_os = "windows") {
1301            sys.push_str(&format!(
1302                "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1303                os
1304            ));
1305        } else {
1306            sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1307        }
1308        if brief {
1309            sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1310        }
1311
1312        sys.push_str(
1313            "\nCORE RULES:\n\
1314             - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1315             - Verify after edits: run `verify_build` after code changes, before committing.\n\
1316             - One tool at a time. Do not batch unrelated tool calls.\n\
1317             - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1318             - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1319             - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1320        );
1321
1322        if !tools.is_empty() {
1323            sys.push_str("\n# AVAILABLE TOOLS\n");
1324            for tool in tools {
1325                let desc: String = tool.function.description.chars().take(120).collect();
1326                sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1327            }
1328        }
1329
1330        sys.push_str("<turn|>\n");
1331        sys
1332    }
1333
1334    fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1335        let current_model = self.current_model();
1336        let current_context_length = self.current_context_length();
1337        let os = std::env::consts::OS;
1338        let mut sys = format!(
1339            "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1340            crate::hematite_version_display()
1341        );
1342        if professional {
1343            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1344        } else {
1345            sys.push_str(&format!(
1346                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1347                self.species
1348            ));
1349        }
1350        if !current_model.is_empty() {
1351            sys.push_str(&format!(
1352                "Loaded model: {} | Context window: {} tokens.\n",
1353                current_model, current_context_length
1354            ));
1355        } else {
1356            sys.push_str(&format!(
1357                "Context window: {} tokens.\n",
1358                current_context_length
1359            ));
1360        }
1361        sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1362        sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1363        sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1364        if cfg!(target_os = "windows") {
1365            sys.push_str(&format!(
1366                "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1367                os
1368            ));
1369        } else {
1370            sys.push_str(&format!(
1371                "You are running on {}. Use the native Unix shell conventions.\n",
1372                os
1373            ));
1374        }
1375        if brief {
1376            sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1377        }
1378        if is_gemma4_model_name(&current_model) {
1379            sys.push_str(
1380                "Gemma 4 note: use exact tool JSON with no extra prose when calling tools.\n",
1381            );
1382        }
1383        sys.push_str("<turn|>\n");
1384        sys
1385    }
1386
1387    // ── Non-streaming call (used for agentic turns with tool support) ─────────
1388
1389    /// Send messages to the model. Returns (text_content, tool_calls).
1390    /// Exactly one of the two will be Some on a successful response.
1391    pub async fn call_with_tools(
1392        &self,
1393        messages: &[ChatMessage],
1394        tools: &[ToolDefinition],
1395        // Override the model ID for this call. None = use the live runtime model.
1396        model_override: Option<&str>,
1397    ) -> Result<
1398        (
1399            Option<String>,
1400            Option<Vec<ToolCallResponse>>,
1401            Option<TokenUsage>,
1402            Option<String>,
1403        ),
1404        String,
1405    > {
1406        let _permit = self
1407            .kv_semaphore
1408            .acquire()
1409            .await
1410            .map_err(|e| e.to_string())?;
1411
1412        let current_model = self.current_model();
1413        let model = model_override.unwrap_or(current_model.as_str()).to_string();
1414        let filtered_tools = if cfg!(target_os = "windows") {
1415            tools
1416                .iter()
1417                .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1418                .cloned()
1419                .collect::<Vec<_>>()
1420        } else {
1421            tools.to_vec()
1422        };
1423
1424        let request_messages = if should_use_gemma_native_formatting(self, &model) {
1425            prepare_gemma_native_messages(messages)
1426        } else {
1427            messages.to_vec()
1428        };
1429
1430        // In compact context windows, restrict tools to the core coding set.
1431        // Full schemas for 36+ tools add 10k+ tokens via the model's chat template (e.g. Gemma 4).
1432        // Sending a small core set keeps schemas available for structured tool-call dispatch
1433        // while staying within the 16k budget.
1434        const COMPACT_CORE_TOOLS: &[&str] = &[
1435            "read_file",
1436            "inspect_lines",
1437            "edit_file",
1438            "write_file",
1439            "grep_files",
1440            "list_files",
1441            "verify_build",
1442            "shell",
1443        ];
1444        let effective_tools = if is_compact_context_window(self.current_context_length()) {
1445            let core: Vec<_> = filtered_tools
1446                .iter()
1447                .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1448                .cloned()
1449                .collect();
1450            if core.is_empty() {
1451                None
1452            } else {
1453                Some(core)
1454            }
1455        } else if filtered_tools.is_empty() {
1456            None
1457        } else {
1458            Some(filtered_tools)
1459        };
1460
1461        let request = ChatRequest {
1462            model: model.clone(),
1463            messages: request_messages,
1464            temperature: 0.2,
1465            stream: false,
1466            tools: effective_tools,
1467        };
1468
1469        // Exponential backoff: retry up to 3× on 5xx / timeout / connect errors.
1470        preflight_chat_request(
1471            &model,
1472            &request.messages,
1473            request.tools.as_deref().unwrap_or(&[]),
1474            self.current_context_length(),
1475        )?;
1476
1477        let mut last_err = String::new();
1478        let mut response_opt: Option<reqwest::Response> = None;
1479        for attempt in 0..3u32 {
1480            match self.client.post(&self.api_url).json(&request).send().await {
1481                Ok(res) if res.status().is_success() => {
1482                    response_opt = Some(res);
1483                    break;
1484                }
1485                Ok(res) if res.status().as_u16() >= 500 => {
1486                    last_err = format!("LM Studio error {}", res.status());
1487                }
1488                Ok(res) => {
1489                    // 4xx — don't retry
1490                    let status = res.status();
1491                    let body = res.text().await.unwrap_or_default();
1492                    let preview = &body[..body.len().min(300)];
1493                    return Err(format!("LM Studio error {}: {}", status, preview));
1494                }
1495                Err(e) if e.is_timeout() || e.is_connect() => {
1496                    last_err = format!("Request failed: {}", e);
1497                }
1498                Err(e) => return Err(format!("Request failed: {}", e)),
1499            }
1500            if attempt < 2 {
1501                let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1502                tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1503            }
1504        }
1505        let res = response_opt
1506            .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1507
1508        let body: ChatResponse = res
1509            .json()
1510            .await
1511            .map_err(|e| format!("Response parse error: {}", e))?;
1512
1513        if let Some(usage) = &body.usage {
1514            let mut econ = self.economics.lock().unwrap();
1515            econ.input_tokens += usage.prompt_tokens;
1516            econ.output_tokens += usage.completion_tokens;
1517        }
1518
1519        let choice = body
1520            .choices
1521            .into_iter()
1522            .next()
1523            .ok_or_else(|| "Empty response from model".to_string())?;
1524
1525        let finish_reason = choice.finish_reason;
1526        let mut tool_calls = choice.message.tool_calls;
1527        let mut content = choice.message.content;
1528
1529        // Gemma-4 Fallback: If the model outputs native <|tool_call|> tags in the text content,
1530        // extract them and treat them as valid tool calls.
1531        if let Some(raw_content) = &content {
1532            let native_calls = extract_native_tool_calls(raw_content);
1533            if !native_calls.is_empty() {
1534                let mut existing = tool_calls.unwrap_or_default();
1535                existing.extend(native_calls);
1536                tool_calls = Some(existing);
1537                let stripped = strip_native_tool_call_text(raw_content);
1538                content = if stripped.trim().is_empty() {
1539                    None
1540                } else {
1541                    Some(stripped)
1542                };
1543            }
1544        }
1545
1546        if is_gemma4_model_name(&model) {
1547            if let Some(calls) = tool_calls.as_mut() {
1548                for call in calls.iter_mut() {
1549                    call.function.arguments = normalize_tool_argument_string(
1550                        &call.function.name,
1551                        &call.function.arguments,
1552                    );
1553                }
1554            }
1555        }
1556
1557        // Qwen3 Fallback: When the model generates tool calls inside a <think> block,
1558        // LM Studio routes the entire thinking output (including <tool_call> XML) to
1559        // `reasoning_content` instead of `tool_calls`. If content is empty and we have
1560        // no tool calls yet, check reasoning_content for embedded tool call markup.
1561        let reasoning_text = choice.message.reasoning_content.unwrap_or_default();
1562        if tool_calls.as_ref().map(|v| v.is_empty()).unwrap_or(true)
1563            && content.as_ref().map(|s| s.trim().is_empty()).unwrap_or(true)
1564            && !reasoning_text.is_empty()
1565        {
1566            let recovered = extract_native_tool_calls(&reasoning_text);
1567            if !recovered.is_empty() {
1568                tool_calls = Some(recovered);
1569                // Clear content so downstream code doesn't see an empty string.
1570                content = None;
1571            }
1572        }
1573
1574        Ok((content, tool_calls, body.usage, finish_reason))
1575    }
1576
1577    // ── Streaming call (used for plain-text responses) ────────────────────────
1578
1579    /// Stream a conversation (no tools). Emits Token/Done/Error events.
1580    pub async fn stream_messages(
1581        &self,
1582        messages: &[ChatMessage],
1583        tx: mpsc::Sender<InferenceEvent>,
1584    ) -> Result<(), Box<dyn std::error::Error>> {
1585        let current_model = self.current_model();
1586        let request_messages = if should_use_gemma_native_formatting(self, &current_model) {
1587            prepare_gemma_native_messages(messages)
1588        } else {
1589            messages
1590                .iter()
1591                .map(|m| {
1592                    let mut clone = m.clone();
1593                    let current_text = m.content.as_str();
1594                    if !current_text.starts_with("<|turn>") {
1595                        clone.content = MessageContent::Text(format!(
1596                            "<|turn>{}\n{}\n<turn|>",
1597                            m.role, current_text
1598                        ));
1599                    }
1600                    clone
1601                })
1602                .collect()
1603        };
1604
1605        let request = ChatRequest {
1606            model: current_model.clone(),
1607            messages: request_messages,
1608            temperature: 0.7,
1609            stream: true,
1610            tools: None,
1611        };
1612
1613        if let Err(e) = preflight_chat_request(
1614            &current_model,
1615            &request.messages,
1616            &[],
1617            self.current_context_length(),
1618        ) {
1619            let tag = classify_runtime_failure_tag(&e);
1620            let _ = tx
1621                .send(InferenceEvent::ProviderStatus {
1622                    state: provider_state_for_failure_tag(tag),
1623                    summary: compact_runtime_failure_summary(tag, &e),
1624                })
1625                .await;
1626            let _ = tx
1627                .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1628                .await;
1629            let _ = tx.send(InferenceEvent::Done).await;
1630            return Ok(());
1631        }
1632
1633        let mut last_err = String::new();
1634        let mut response_opt: Option<reqwest::Response> = None;
1635        for attempt in 0..2u32 {
1636            match self.client.post(&self.api_url).json(&request).send().await {
1637                Ok(res) if res.status().is_success() => {
1638                    response_opt = Some(res);
1639                    break;
1640                }
1641                Ok(res) if res.status().as_u16() >= 500 => {
1642                    last_err = format!("LM Studio error {}", res.status());
1643                }
1644                Ok(res) => {
1645                    let status = res.status();
1646                    let body = res.text().await.unwrap_or_default();
1647                    let preview = &body[..body.len().min(300)];
1648                    let detail = format!("LM Studio error {}: {}", status, preview);
1649                    let tag = classify_runtime_failure_tag(&detail);
1650                    let _ = tx
1651                        .send(InferenceEvent::ProviderStatus {
1652                            state: provider_state_for_failure_tag(tag),
1653                            summary: compact_runtime_failure_summary(tag, &detail),
1654                        })
1655                        .await;
1656                    let _ = tx
1657                        .send(InferenceEvent::Error(format_runtime_failure_message(
1658                            &detail,
1659                        )))
1660                        .await;
1661                    let _ = tx.send(InferenceEvent::Done).await;
1662                    return Ok(());
1663                }
1664                Err(e) if e.is_timeout() || e.is_connect() => {
1665                    last_err = format!("Request failed: {}", e);
1666                }
1667                Err(e) => {
1668                    let detail = format!("Request failed: {}", e);
1669                    let tag = classify_runtime_failure_tag(&detail);
1670                    let _ = tx
1671                        .send(InferenceEvent::ProviderStatus {
1672                            state: provider_state_for_failure_tag(tag),
1673                            summary: compact_runtime_failure_summary(tag, &detail),
1674                        })
1675                        .await;
1676                    let _ = tx
1677                        .send(InferenceEvent::Error(format_runtime_failure_message(
1678                            &detail,
1679                        )))
1680                        .await;
1681                    let _ = tx.send(InferenceEvent::Done).await;
1682                    return Ok(());
1683                }
1684            }
1685            if attempt < 1 {
1686                let _ = tx
1687                    .send(InferenceEvent::ProviderStatus {
1688                        state: ProviderRuntimeState::Recovering,
1689                        summary: "LM Studio degraded during stream startup; retrying once.".into(),
1690                    })
1691                    .await;
1692                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1693            }
1694        }
1695        let Some(res) = response_opt else {
1696            let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1697            let tag = classify_runtime_failure_tag(&detail);
1698            let _ = tx
1699                .send(InferenceEvent::ProviderStatus {
1700                    state: provider_state_for_failure_tag(tag),
1701                    summary: compact_runtime_failure_summary(tag, &detail),
1702                })
1703                .await;
1704            let _ = tx
1705                .send(InferenceEvent::Error(format_runtime_failure_message(
1706                    &detail,
1707                )))
1708                .await;
1709            let _ = tx.send(InferenceEvent::Done).await;
1710            return Ok(());
1711        };
1712
1713        use futures::StreamExt;
1714        let mut byte_stream = res.bytes_stream();
1715
1716        // [Collaborative Strategy] TokenBuffer refactor suggested by Hematite local agent.
1717        // Aggregates tokens to ensure coherent linguistic chunks for UI/Voice.
1718        let mut line_buffer = String::new();
1719        let mut content_buffer = String::new();
1720        let mut past_think = false;
1721        let mut emitted_any_content = false;
1722        let mut emitted_live_status = false;
1723
1724        // Immediate cancel gate: break *before* awaiting the stream
1725        // so Escape works even when LM Studio is silent between chunks.
1726        loop {
1727            let next = tokio::select! {
1728                // Race: next SSE chunk vs cancel poll
1729                chunk = byte_stream.next() => chunk,
1730                _ = tokio::time::sleep(std::time::Duration::from_millis(50)) => {
1731                    if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1732                        break;
1733                    }
1734                    continue;
1735                }
1736            };
1737
1738            let Some(item) = next else { break };
1739
1740            let chunk = match item {
1741                Ok(chunk) => chunk,
1742                Err(e) => {
1743                    let detail = format!("Request failed: {}", e);
1744                    let tag = classify_runtime_failure_tag(&detail);
1745                    let _ = tx
1746                        .send(InferenceEvent::ProviderStatus {
1747                            state: provider_state_for_failure_tag(tag),
1748                            summary: compact_runtime_failure_summary(tag, &detail),
1749                        })
1750                        .await;
1751                    let _ = tx
1752                        .send(InferenceEvent::Error(format_runtime_failure_message(
1753                            &detail,
1754                        )))
1755                        .await;
1756                    let _ = tx.send(InferenceEvent::Done).await;
1757                    return Ok(());
1758                }
1759            };
1760            line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1761
1762            while let Some(pos) = line_buffer.find("\n\n") {
1763                let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1764                let data_pos = match event_str.find("data: ") {
1765                    Some(p) => p,
1766                    None => continue,
1767                };
1768
1769                let data = event_str[data_pos + 6..].trim();
1770                if data == "[DONE]" {
1771                    break;
1772                }
1773
1774                if let Ok(json) = serde_json::from_str::<Value>(data) {
1775                    if let Some(content) = json["choices"][0]["delta"]["content"].as_str() {
1776                        if content.is_empty() {
1777                            continue;
1778                        }
1779
1780                        if !past_think {
1781                            let lc = content.to_lowercase();
1782                            let close = lc
1783                                .find("<channel|>")
1784                                .map(|i| (i, "<channel|>".len()))
1785                                .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1786
1787                            if let Some((tag_start, tag_len)) = close {
1788                                // Flush any existing thought buffer
1789                                let before = &content[..tag_start];
1790                                content_buffer.push_str(before);
1791                                if !content_buffer.trim().is_empty() {
1792                                    let _ = tx
1793                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1794                                        .await;
1795                                    emitted_any_content = true;
1796                                }
1797                                content_buffer.clear();
1798
1799                                past_think = true;
1800                                let after = content[tag_start + tag_len..].trim_start_matches('\n');
1801                                content_buffer.push_str(after);
1802                            } else {
1803                                // Still in reasoning block
1804                                content_buffer.push_str(content);
1805                                // Heuristic: Flush thoughts on paragraph/sentence breaks for SPECULAR
1806                                if content_buffer.len() > 30
1807                                    && (content.contains('\n') || content.contains('.'))
1808                                {
1809                                    let _ = tx
1810                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1811                                        .await;
1812                                    emitted_any_content = true;
1813                                    content_buffer.clear();
1814                                }
1815                            }
1816                        } else {
1817                            // PAST THINK: final answer tokens.
1818                            // [Linguistic Buffering] Aggregate into content_buffer until a boundary is hit.
1819                            content_buffer.push_str(content);
1820                            let is_boundary = content.contains(' ')
1821                                || content.contains('.')
1822                                || content.contains('!')
1823                                || content.contains('?');
1824
1825                            if content_buffer.len() > 10 && is_boundary {
1826                                if !emitted_live_status {
1827                                    let _ = tx
1828                                        .send(InferenceEvent::ProviderStatus {
1829                                            state: ProviderRuntimeState::Live,
1830                                            summary: String::new(),
1831                                        })
1832                                        .await;
1833                                    emitted_live_status = true;
1834                                }
1835                                let _ =
1836                                    tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1837                                emitted_any_content = true;
1838                                content_buffer.clear();
1839                            }
1840                        }
1841                    }
1842                }
1843            }
1844        }
1845
1846        // Final Flush
1847        if !content_buffer.is_empty() {
1848            if past_think {
1849                if !emitted_live_status {
1850                    let _ = tx
1851                        .send(InferenceEvent::ProviderStatus {
1852                            state: ProviderRuntimeState::Live,
1853                            summary: String::new(),
1854                        })
1855                        .await;
1856                }
1857                let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1858            } else {
1859                let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1860            }
1861            emitted_any_content = true;
1862        }
1863
1864        if !emitted_any_content {
1865            let _ = tx
1866                .send(InferenceEvent::ProviderStatus {
1867                    state: ProviderRuntimeState::EmptyResponse,
1868                    summary: compact_runtime_failure_summary(
1869                        "empty_model_response",
1870                        "Empty response from model",
1871                    ),
1872                })
1873                .await;
1874            let _ = tx
1875                .send(InferenceEvent::Error(format_runtime_failure_message(
1876                    "Empty response from model",
1877                )))
1878                .await;
1879            let _ = tx.send(InferenceEvent::Done).await;
1880            return Ok(());
1881        }
1882
1883        let _ = tx.send(InferenceEvent::Done).await;
1884        Ok(())
1885    }
1886
1887    /// Single-turn streaming (legacy helper used by startup sequence).
1888    pub async fn stream_generation(
1889        &self,
1890        prompt: &str,
1891        snark: u8,
1892        chaos: u8,
1893        brief: bool,
1894        professional: bool,
1895        tx: mpsc::Sender<InferenceEvent>,
1896    ) -> Result<(), Box<dyn std::error::Error>> {
1897        let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1898        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1899        self.stream_messages(&messages, tx).await
1900    }
1901
1902    // ── Swarm worker helpers (non-streaming) ──────────────────────────────────
1903
1904    /// Runs a task using the `worker_model` if set, otherwise falls back to the main `model`.
1905    pub async fn generate_task_worker(
1906        &self,
1907        prompt: &str,
1908        professional: bool,
1909    ) -> Result<String, String> {
1910        let current_model = self.current_model();
1911        let model = self
1912            .worker_model
1913            .as_deref()
1914            .unwrap_or(current_model.as_str());
1915        self.generate_task_with_model(prompt, 0.1, professional, model)
1916            .await
1917    }
1918
1919    pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1920        self.generate_task_with_temp(prompt, 0.1, professional)
1921            .await
1922    }
1923
1924    pub async fn generate_task_with_temp(
1925        &self,
1926        prompt: &str,
1927        temp: f32,
1928        professional: bool,
1929    ) -> Result<String, String> {
1930        let current_model = self.current_model();
1931        self.generate_task_with_model(prompt, temp, professional, &current_model)
1932            .await
1933    }
1934
1935    pub async fn generate_task_with_model(
1936        &self,
1937        prompt: &str,
1938        temp: f32,
1939        professional: bool,
1940        model: &str,
1941    ) -> Result<String, String> {
1942        let _permit = self
1943            .kv_semaphore
1944            .acquire()
1945            .await
1946            .map_err(|e| e.to_string())?;
1947
1948        let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1949        let request_messages = if should_use_gemma_native_formatting(self, model) {
1950            prepare_gemma_native_messages(&[
1951                ChatMessage::system(&system),
1952                ChatMessage::user(prompt),
1953            ])
1954        } else {
1955            vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1956        };
1957        let request = ChatRequest {
1958            model: model.to_string(),
1959            messages: request_messages,
1960            temperature: temp,
1961            stream: false,
1962            tools: None,
1963        };
1964
1965        preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1966
1967        let res = self
1968            .client
1969            .post(&self.api_url)
1970            .json(&request)
1971            .send()
1972            .await
1973            .map_err(|e| format!("LM Studio request failed: {}", e))?;
1974
1975        let body: ChatResponse = res
1976            .json()
1977            .await
1978            .map_err(|e| format!("Failed to parse response: {}", e))?;
1979
1980        body.choices
1981            .first()
1982            .and_then(|c| c.message.content.clone())
1983            .ok_or_else(|| "Empty response from model".to_string())
1984    }
1985
1986    // ── History management ────────────────────────────────────────────────────
1987
1988    /// Prune middle turns when context grows too large, keeping system + recent N.
1989    #[allow(dead_code)]
1990    pub fn snip_history(
1991        &self,
1992        turns: &[ChatMessage],
1993        max_tokens_estimate: usize,
1994        keep_recent: usize,
1995    ) -> Vec<ChatMessage> {
1996        let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
1997        if total_chars / 4 <= max_tokens_estimate {
1998            return turns.to_vec();
1999        }
2000        let keep = keep_recent.min(turns.len());
2001        let mut snipped = vec![turns[0].clone()];
2002        if turns.len() > keep + 1 {
2003            snipped.push(ChatMessage::system(&format!(
2004                "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
2005                turns.len() - keep - 1
2006            )));
2007            snipped.extend_from_slice(&turns[turns.len() - keep..]);
2008        } else {
2009            snipped = turns.to_vec();
2010        }
2011        snipped
2012    }
2013}
2014
2015fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
2016    serde_json::to_vec(value)
2017        .ok()
2018        .map_or(0, |bytes| bytes.len() / 4 + 1)
2019}
2020
2021const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
2022
2023fn estimate_message_tokens(message: &ChatMessage) -> usize {
2024    let content_tokens = match &message.content {
2025        MessageContent::Text(s) => s.len() / 4 + 1,
2026        MessageContent::Parts(parts) => parts
2027            .iter()
2028            .map(|part| match part {
2029                ContentPart::Text { text } => text.len() / 4 + 1,
2030                // Image payloads are transported as data URLs, but their base64
2031                // length should not be treated like plain text context pressure.
2032                ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
2033            })
2034            .sum(),
2035    };
2036    let tool_tokens: usize = message
2037        .tool_calls
2038        .iter()
2039        .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
2040        .sum();
2041    content_tokens + tool_tokens + 6
2042}
2043
2044pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
2045    messages.iter().map(estimate_message_tokens).sum()
2046}
2047
2048fn reserved_output_tokens(context_length: usize) -> usize {
2049    let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2050    proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2051}
2052
2053pub fn estimate_prompt_pressure(
2054    messages: &[ChatMessage],
2055    tools: &[ToolDefinition],
2056    context_length: usize,
2057) -> (usize, usize, usize, u8) {
2058    let estimated_input_tokens =
2059        estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2060    let reserved_output = reserved_output_tokens(context_length);
2061    let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2062    let percent = if context_length == 0 {
2063        0
2064    } else {
2065        ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2066    };
2067    (
2068        estimated_input_tokens,
2069        reserved_output,
2070        estimated_total,
2071        percent,
2072    )
2073}
2074
2075fn preflight_chat_request(
2076    model: &str,
2077    messages: &[ChatMessage],
2078    tools: &[ToolDefinition],
2079    context_length: usize,
2080) -> Result<(), String> {
2081    let (estimated_input_tokens, reserved_output, estimated_total, _) =
2082        estimate_prompt_pressure(messages, tools, context_length);
2083
2084    if estimated_total > context_length {
2085        return Err(format!(
2086            "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2087            model, estimated_input_tokens, reserved_output, estimated_total, context_length
2088        ));
2089    }
2090
2091    Ok(())
2092}
2093
/// Collect project instruction files from the working directory and its ancestors.
///
/// Visits four directories in total: the current working directory and up to
/// three of its ancestors. In each one it looks for `CLAUDE.md`,
/// `CLAUDE.local.md` and `.hematite/instructions.md`. Unreadable or blank files
/// are skipped, and files with identical content are included only once
/// (deduplicated by content hash).
///
/// Each file is truncated to at most 4,000 bytes (on a UTF-8 character boundary)
/// and the combined size of the included file bodies is capped at 12,000 bytes.
///
/// Returns an empty string when nothing was found or the working directory
/// cannot be determined; otherwise the files under a `# Project Instructions`
/// heading, each introduced by a `--- <path> ---` line.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;
    const CANDIDATES: [&str; 3] = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    let Ok(mut dir) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;

    for _ in 0..4 {
        for name in &CANDIDATES {
            let path = dir.join(name);
            // A missing or unreadable file simply fails here; no separate
            // existence check is needed.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back up to a char boundary: slicing at a fixed byte offset
                // panics when it lands inside a multi-byte character.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            // Over budget: stop scanning this directory. Ancestor directories
            // are still visited and may contribute smaller files.
            if total_chars + truncated.len() > MAX_TOTAL {
                break;
            }
            total_chars += truncated.len();
            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
        }

        // Move to the parent directory; stop once there is none.
        if !dir.pop() {
            break;
        }
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2157
/// Extract the reasoning text enclosed in Gemma-4 thought tags.
///
/// Looks for the first `<|channel>thought` tag (ASCII case-insensitive) and
/// returns the trimmed text between it and the next `<channel|>` tag, or up to
/// the end of `text` when the block is never closed.
///
/// Returns `None` when there is no opening tag or the enclosed text is blank.
pub fn extract_think_block(text: &str) -> Option<String> {
    // Official Gemma-4 Native Tags
    const OPEN_TAG: &str = "<|channel>thought";
    const CLOSE_TAG: &str = "<channel|>";

    // ASCII lowercasing leaves byte offsets unchanged, so positions found in
    // `lower` are valid indices into `text`. Full Unicode lowercasing can change
    // byte lengths and would misalign (or invalidate) the slice below.
    let lower = text.to_ascii_lowercase();

    let content_start = lower.find(OPEN_TAG)? + OPEN_TAG.len();
    let content_end = lower[content_start..]
        .find(CLOSE_TAG)
        .map_or(text.len(), |rel| content_start + rel);

    let content = text[content_start..content_end].trim();
    (!content.is_empty()).then(|| content.to_string())
}
2180
2181pub fn strip_think_blocks(text: &str) -> String {
2182    // Fast-path: strip a stray </think> the model emits at the start when it skips
2183    // the opening tag (common with Qwen after tool calls). Strip it before the lower
2184    // allocation so it can't slip through any branch below.
2185    let text = {
2186        let t = text.trim_start();
2187        if t.to_lowercase().starts_with("</think>") {
2188            &t[8..]
2189        } else {
2190            text
2191        }
2192    };
2193
2194    let lower = text.to_lowercase();
2195
2196    // Use the official Gemma-4 closing tag — answer is everything after it.
2197    if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
2198        let answer = text[end..]
2199            .replace("<|channel>thought", "")
2200            .replace("<channel|>", "");
2201        return answer.trim().replace("\n\n\n", "\n\n").to_string();
2202    }
2203
2204    // No closing tag — if there's an unclosed opening tag, discard everything before and during it.
2205    let first_open = [
2206        lower.find("<|channel>thought"), // Prioritize Gemma-4 native
2207        lower.find("<think>"),
2208        lower.find("<thought>"),
2209        lower.find("<|think|>"),
2210    ]
2211    .iter()
2212    .filter_map(|&x| x)
2213    .min();
2214
2215    if let Some(start) = first_open {
2216        if start > 0 {
2217            return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
2218        }
2219        return String::new();
2220    }
2221
2222    // If the model outputs 'naked' reasoning without tags:
2223    // Strip leading sentences like "The user asked..." or "I should present..."
2224    // if they appear before actual answer content.
2225    let naked_reasoning_phrases: &[&str] = &[
2226        "the user asked",
2227        "the user is asking",
2228        "the user wants",
2229        "i will structure",
2230        "i should provide",
2231        "i should give",
2232        "i should avoid",
2233        "i should note",
2234        "i should focus",
2235        "i should keep",
2236        "i should respond",
2237        "i should present",
2238        "i should display",
2239        "i should show",
2240        "i need to",
2241        "i can see from",
2242        "without being overly",
2243        "let me ",
2244        "necessary information in my identity",
2245        "was computed successfully",
2246        "computed successfully",
2247    ];
2248    let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
2249    if is_naked_reasoning {
2250        let lines: Vec<&str> = text.lines().collect();
2251        if !lines.is_empty() {
2252            // Skip leading lines that are themselves reasoning prose or blank.
2253            // Stop skipping at the first line that looks like real answer content.
2254            let mut start_idx = 0;
2255            for (i, line) in lines.iter().enumerate() {
2256                let l = line.to_lowercase();
2257                let is_reasoning_line =
2258                    naked_reasoning_phrases.iter().any(|p| l.contains(p)) || l.trim().is_empty();
2259                if is_reasoning_line {
2260                    start_idx = i + 1;
2261                } else {
2262                    break;
2263                }
2264            }
2265            if start_idx < lines.len() {
2266                return lines[start_idx..]
2267                    .join("\n")
2268                    .trim()
2269                    .replace("\n\n\n", "\n\n")
2270                    .to_string();
2271            }
2272            // Entire response was reasoning prose — return empty.
2273            return String::new();
2274        }
2275    }
2276
2277    // Strip leaked XML tool-call fragments that Qwen sometimes emits when it
2278    // abandons a tool call mid-generation (e.g. </parameter></function></tool_call>).
2279    let cleaned = strip_xml_tool_call_artifacts(text);
2280    cleaned.trim().replace("\n\n\n", "\n\n").to_string()
2281}
2282
/// Remove stray XML tool-call tags (open and close forms) and leaked reasoning
/// closing tags that local models occasionally leave in visible output when
/// they start-then-abandon a tool call.
///
/// Matching is ASCII case-insensitive. Only the tags themselves are removed;
/// whitespace and blank lines around them are left for the caller to tidy.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    // Tags to remove. All entries are lowercase ASCII.
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        // Stray think/reasoning closing tags that leak after block extraction.
        "</think>",
        "</thought>",
        "</thinking>",
    ];

    let mut out = text.to_string();
    // ASCII folding keeps byte lengths identical, so an offset found in `lower`
    // is the same offset in `out`. Unicode lowercasing does not guarantee that
    // (e.g. 'İ' grows from 2 to 3 bytes) and would remove the wrong bytes or
    // panic. The folded copy is edited in lock-step instead of being rebuilt
    // on every iteration.
    let mut lower = out.to_ascii_lowercase();

    for tag in XML_ARTIFACTS {
        // Re-scan from the start after each removal: deleting a tag can splice
        // the surrounding text into a fresh occurrence of the same tag.
        while let Some(pos) = lower.find(*tag) {
            let span = pos..pos + tag.len();
            out.replace_range(span.clone(), "");
            lower.replace_range(span, "");
        }
    }
    out
}
2315
2316/// Extract native Gemma-4 <|tool_call|> tags from text.
2317/// Format: <|tool_call|>call:func_name{key:<|"|>value<|"|>, key2:value2}<tool_call|>
2318pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2319    use regex::Regex;
2320    let mut results = Vec::new();
2321
2322    // Regex to find the tool call block
2323    // Formats supported:
2324    // <|tool_call|>call:func_name{args}<tool_call|>
2325    // <|tool_call>call:func_name{args}[END_TOOL_REQUEST]
2326    // <|tool_call>call:func_name{args}<tool_call|>
2327    let re_call = Regex::new(
2328        r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2329    ).unwrap();
2330    // Regex to find arguments inside the braces
2331    // Handles <|"|> wrappers and plain values
2332    let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2333
2334    for cap in re_call.captures_iter(text) {
2335        let name = cap[1].to_string();
2336        let args_str = &cap[2];
2337        let mut arguments = serde_json::Map::new();
2338
2339        for arg_cap in re_arg.captures_iter(args_str) {
2340            let key = arg_cap[1].to_string();
2341            // arg_cap[2] is the <|"|> wrapped value, arg_cap[3] is the plain value
2342            let val_raw = arg_cap
2343                .get(2)
2344                .map(|m| m.as_str())
2345                .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2346                .unwrap_or("")
2347                .trim();
2348            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2349
2350            // Try to parse as JSON types (bool, number), otherwise string
2351            let val = if normalized_raw == "true" {
2352                Value::Bool(true)
2353            } else if normalized_raw == "false" {
2354                Value::Bool(false)
2355            } else if let Ok(n) = normalized_raw.parse::<i64>() {
2356                Value::Number(n.into())
2357            } else if let Ok(n) = normalized_raw.parse::<u64>() {
2358                Value::Number(n.into())
2359            } else if let Ok(n) = normalized_raw.parse::<f64>() {
2360                serde_json::Number::from_f64(n)
2361                    .map(Value::Number)
2362                    .unwrap_or(Value::String(normalized_raw.clone()))
2363            } else {
2364                Value::String(normalized_raw)
2365            };
2366
2367            arguments.insert(key, val);
2368        }
2369
2370        results.push(ToolCallResponse {
2371            id: format!("call_{}", rand::random::<u32>()),
2372            call_type: "function".to_string(),
2373            function: ToolCallFn {
2374                name,
2375                arguments: Value::Object(arguments).to_string(),
2376            },
2377        });
2378    }
2379
2380    results
2381}
2382
2383pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2384    let trimmed = raw.trim();
2385    let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2386
2387    let mut value = match serde_json::from_str::<Value>(&candidate) {
2388        Ok(v) => v,
2389        Err(_) => return candidate,
2390    };
2391    normalize_tool_argument_value(tool_name, &mut value);
2392    value.to_string()
2393}
2394
2395fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2396    match value {
2397        Value::String(s) => *s = normalize_string_arg(s),
2398        Value::Array(items) => {
2399            for item in items {
2400                normalize_tool_argument_value(tool_name, item);
2401            }
2402        }
2403        Value::Object(map) => {
2404            for val in map.values_mut() {
2405                normalize_tool_argument_value(tool_name, val);
2406            }
2407            if tool_name == "grep_files" {
2408                if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2409                    *pattern = normalize_regex_pattern(pattern);
2410                }
2411            }
2412            for key in ["path", "extension", "query", "command", "reason"] {
2413                if let Some(Value::String(s)) = map.get_mut(key) {
2414                    *s = normalize_string_arg(s);
2415                }
2416            }
2417        }
2418        _ => {}
2419    }
2420}
2421
/// Strip exactly one layer of matching quotes (`"`, `'` or `` ` ``) from `input`.
///
/// Returns `None` when the input has fewer than two characters or is not
/// enclosed in the same quote character on both ends. Otherwise the enclosed
/// text is returned with `\"` turned into `"`, then `\\` turned into `\`, and
/// surrounding whitespace trimmed.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    let mut chars = input.chars();
    // Pull one character off each end; either `?` bails on inputs that are too short.
    let (open, close) = (chars.next()?, chars.next_back()?);
    if open != close || !matches!(open, '"' | '\'' | '`') {
        return None;
    }

    // Whatever the iterator still holds is the text between the two quotes.
    let inner = chars.as_str();
    let unescaped = inner.replace("\\\"", "\"").replace("\\\\", "\\");
    Some(unescaped.trim().to_string())
}
2435
/// Trim `input` and peel off every layer of matching surrounding quotes.
///
/// While the text is at least two bytes long and starts and ends with the same
/// quote character (`"`, `'` or `` ` ``), that pair is removed and the
/// remainder is trimmed again. Unbalanced or lone quotes are left in place.
fn normalize_string_arg(input: &str) -> String {
    const QUOTES: [char; 3] = ['"', '\'', '`'];

    let mut rest = input.trim();
    while rest.len() >= 2 {
        let quote = match rest.chars().next() {
            Some(c) if QUOTES.contains(&c) && rest.ends_with(c) => c,
            _ => break,
        };
        let width = quote.len_utf8();
        rest = rest[width..rest.len() - width].trim();
    }
    rest.to_string()
}
2453
2454fn normalize_regex_pattern(input: &str) -> String {
2455    let out = normalize_string_arg(input);
2456    if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2457        out[1..out.len() - 1].to_string()
2458    } else {
2459        out
2460    }
2461}
2462
2463fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2464    let mut system_blocks = Vec::new();
2465    let mut prepared = Vec::new();
2466    let mut seeded = false;
2467
2468    for message in messages {
2469        if message.role == "system" {
2470            let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2471                .trim()
2472                .to_string();
2473            if !cleaned.is_empty() {
2474                system_blocks.push(cleaned);
2475            }
2476            continue;
2477        }
2478
2479        let mut clone = message.clone();
2480        clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2481
2482        if !seeded && message.role == "user" {
2483            let mut merged = String::new();
2484            if !system_blocks.is_empty() {
2485                merged.push_str("System instructions for this turn:\n");
2486                merged.push_str(&system_blocks.join("\n\n"));
2487                merged.push_str("\n\n");
2488            }
2489            merged.push_str(clone.content.as_str());
2490            clone.content = MessageContent::Text(merged);
2491            seeded = true;
2492        }
2493
2494        prepared.push(clone);
2495    }
2496
2497    if !seeded && !system_blocks.is_empty() {
2498        prepared.insert(
2499            0,
2500            ChatMessage::user(&format!(
2501                "System instructions for this turn:\n{}",
2502                system_blocks.join("\n\n")
2503            )),
2504        );
2505    }
2506
2507    prepared
2508}
2509
/// Remove legacy turn markers (`<|turn>role\n` openers and `<turn|>` closers)
/// from `text` and trim the result.
///
/// Openers are only removed when followed by a newline, exactly as written in
/// the marker list below.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const TURN_MARKERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    // Markers are removed one after another, in list order.
    let without_markers = TURN_MARKERS
        .iter()
        .fold(text.to_string(), |acc, marker| acc.replace(*marker, ""));
    without_markers.trim().to_string()
}
2519
2520pub fn strip_native_tool_call_text(text: &str) -> String {
2521    use regex::Regex;
2522    let re_call = Regex::new(
2523        r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2524    ).unwrap();
2525    let re_response =
2526        Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2527            .unwrap();
2528    let without_calls = re_call.replace_all(text, "");
2529    re_response
2530        .replace_all(without_calls.as_ref(), "")
2531        .trim()
2532        .to_string()
2533}
2534
// Unit tests for prompt construction and Gemma-native tool-call parsing/stripping.
#[cfg(test)]
mod tests {
    use super::*;

    /// The system prompt built by the engine must embed the crate's version constant.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let system = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);
        assert!(system.contains(crate::HEMATITE_VERSION));
    }

    /// A call opened with `<|tool_call>` and closed with `<tool_call|>` is parsed:
    /// numeric arguments become JSON integers, the escaped-quote path becomes a
    /// plain string, and stripping removes the tool-call markers from the text.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        let text = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "read_file");

        // `arguments` is a JSON object serialized to a string.
        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args.get("limit").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(args.get("offset").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(
            args.get("path").and_then(|v| v.as_str()),
            Some("src/ui/tui.rs")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<tool_call|>"));
    }

    /// A transcript containing two tool calls interleaved with `<|tool_response>`
    /// segments yields both calls in order, and stripping leaves neither
    /// tool-call nor tool-response markers behind.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let text = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "list_files");
        assert_eq!(calls[1].function.name, "read_file");

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<|tool_response"));
        assert!(!stripped.contains("<tool_response|>"));
    }
}
hematite/agent/inference.rs

hematite/agent/
inference.rs