hematite/agent/
inference.rs

1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
7// ── Engine ────────────────────────────────────────────────────────────────────
8
9pub struct InferenceEngine {
10    pub client: reqwest::Client,
11    pub api_url: String,
12    /// Root URL of the LLM provider (e.g. `http://localhost:1234`).
13    /// All non-completions endpoints (models list, health, embeddings) are derived from this.
14    pub base_url: String,
15    pub species: String,
16    pub snark: u8,
17    pub kv_semaphore: Semaphore,
18    /// The model ID currently loaded in LM Studio (auto-detected on boot).
19    pub model: std::sync::RwLock<String>,
20    /// Context window length in tokens (auto-detected from LM Studio, default 32768).
21    pub context_length: std::sync::atomic::AtomicUsize,
22    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
23    /// Optional model ID for worker-level tasks (Swarms / research).
24    pub worker_model: Option<String>,
25    /// Opt-in Gemma-native request shaping. Off by default.
26    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
27    /// Global cancellation token for hard-interrupting the inference stream.
28    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
29}
30
/// Reports whether `model` names a Gemma 4 model.
///
/// The comparison is ASCII case-insensitive and accepts both the hyphenated
/// (`gemma-4`) and the compact (`gemma4`) spelling anywhere inside the id.
pub fn is_gemma4_model_name(model: &str) -> bool {
    let normalized = model.to_ascii_lowercase();
    ["gemma-4", "gemma4"]
        .iter()
        .any(|marker| normalized.contains(*marker))
}
35
36fn should_use_gemma_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37    is_gemma4_model_name(model) && engine.gemma_native_formatting_enabled()
38}
39
40// ── OpenAI Tool Definition ────────────────────────────────────────────────────
41
42#[derive(Serialize, Clone, Debug)]
43pub struct ToolDefinition {
44    #[serde(rename = "type")]
45    pub tool_type: String,
46    pub function: ToolFunction,
47    #[serde(skip_serializing, skip_deserializing)]
48    pub metadata: ToolMetadata,
49}
50
51#[derive(Serialize, Clone, Debug)]
52pub struct ToolFunction {
53    pub name: String,
54    pub description: String,
55    pub parameters: Value,
56}
57
/// Coarse grouping of tools, assigned by `tool_metadata_for_name`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ToolCategory {
    /// `read_file`, `inspect_lines`, `grep_files`, `list_files`.
    RepoRead,
    /// `write_file`, `edit_file`, `patch_hunk`, `multi_search_replace`.
    RepoWrite,
    /// `shell`, `inspect_host`, `resolve_host_issue`.
    Runtime,
    /// `trace_runtime_flow`.
    Architecture,
    /// `describe_toolchain`.
    Toolchain,
    /// `verify_build`.
    Verification,
    /// The `git_*` tools.
    Git,
    /// `research_web`, `fetch_docs`.
    Research,
    /// `vision_analyze`.
    Vision,
    /// The `lsp_*` tools.
    Lsp,
    /// Workflow runners plus `auto_pin_context`, `list_pinned`, `clarify`
    /// and `manage_tasks`.
    Workflow,
    /// Any tool whose name starts with `mcp__`.
    External,
    /// Fallback for names that match nothing else.
    Other,
}
74
75#[derive(Clone, Copy, Debug, PartialEq, Eq)]
76pub struct ToolMetadata {
77    pub category: ToolCategory,
78    pub mutates_workspace: bool,
79    pub external_surface: bool,
80    pub trust_sensitive: bool,
81    pub read_only_friendly: bool,
82    pub plan_scope: bool,
83}
84
85pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
86    if name.starts_with("mcp__") {
87        let lower = name.to_ascii_lowercase();
88        let mutates_workspace = [
89            "__edit",
90            "__write",
91            "__create",
92            "__move",
93            "__delete",
94            "__remove",
95            "__rename",
96            "__replace",
97            "__patch",
98        ]
99        .iter()
100        .any(|needle| lower.contains(needle));
101        return ToolMetadata {
102            category: ToolCategory::External,
103            mutates_workspace,
104            external_surface: true,
105            trust_sensitive: true,
106            read_only_friendly: !mutates_workspace,
107            plan_scope: false,
108        };
109    }
110
111    match name {
112        "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
113            category: ToolCategory::RepoRead,
114            mutates_workspace: false,
115            external_surface: false,
116            trust_sensitive: false,
117            read_only_friendly: true,
118            plan_scope: true,
119        },
120        "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => ToolMetadata {
121            category: ToolCategory::RepoWrite,
122            mutates_workspace: true,
123            external_surface: false,
124            trust_sensitive: true,
125            read_only_friendly: false,
126            plan_scope: true,
127        },
128        "trace_runtime_flow" => ToolMetadata {
129            category: ToolCategory::Architecture,
130            mutates_workspace: false,
131            external_surface: false,
132            trust_sensitive: false,
133            read_only_friendly: true,
134            plan_scope: false,
135        },
136        "describe_toolchain" => ToolMetadata {
137            category: ToolCategory::Toolchain,
138            mutates_workspace: false,
139            external_surface: false,
140            trust_sensitive: false,
141            read_only_friendly: true,
142            plan_scope: false,
143        },
144        "shell" => ToolMetadata {
145            category: ToolCategory::Runtime,
146            mutates_workspace: true,
147            external_surface: false,
148            trust_sensitive: true,
149            read_only_friendly: false,
150            plan_scope: false,
151        },
152        "inspect_host" => ToolMetadata {
153            category: ToolCategory::Runtime,
154            mutates_workspace: false,
155            external_surface: false,
156            trust_sensitive: false,
157            read_only_friendly: true,
158            plan_scope: false,
159        },
160        "resolve_host_issue" => ToolMetadata {
161            category: ToolCategory::Runtime,
162            mutates_workspace: true,
163            external_surface: true,
164            trust_sensitive: true,
165            read_only_friendly: false,
166            plan_scope: false,
167        },
168        "run_hematite_maintainer_workflow" => ToolMetadata {
169            category: ToolCategory::Workflow,
170            mutates_workspace: true,
171            external_surface: false,
172            trust_sensitive: true,
173            read_only_friendly: false,
174            plan_scope: false,
175        },
176        "run_workspace_workflow" => ToolMetadata {
177            category: ToolCategory::Workflow,
178            mutates_workspace: true,
179            external_surface: false,
180            trust_sensitive: true,
181            read_only_friendly: false,
182            plan_scope: false,
183        },
184        "verify_build" => ToolMetadata {
185            category: ToolCategory::Verification,
186            mutates_workspace: false,
187            external_surface: false,
188            trust_sensitive: false,
189            read_only_friendly: true,
190            plan_scope: false,
191        },
192        "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
193            ToolMetadata {
194                category: ToolCategory::Git,
195                mutates_workspace: true,
196                external_surface: false,
197                trust_sensitive: true,
198                read_only_friendly: false,
199                plan_scope: false,
200            }
201        }
202        "research_web" | "fetch_docs" => ToolMetadata {
203            category: ToolCategory::Research,
204            mutates_workspace: false,
205            external_surface: false,
206            trust_sensitive: false,
207            read_only_friendly: true,
208            plan_scope: false,
209        },
210        "vision_analyze" => ToolMetadata {
211            category: ToolCategory::Vision,
212            mutates_workspace: false,
213            external_surface: false,
214            trust_sensitive: false,
215            read_only_friendly: true,
216            plan_scope: false,
217        },
218        "lsp_definitions"
219        | "lsp_references"
220        | "lsp_hover"
221        | "lsp_rename_symbol"
222        | "lsp_get_diagnostics"
223        | "lsp_search_symbol" => ToolMetadata {
224            category: ToolCategory::Lsp,
225            mutates_workspace: false,
226            external_surface: false,
227            trust_sensitive: false,
228            read_only_friendly: true,
229            plan_scope: false,
230        },
231        "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
232            category: ToolCategory::Workflow,
233            mutates_workspace: false,
234            external_surface: false,
235            trust_sensitive: false,
236            read_only_friendly: true,
237            plan_scope: true,
238        },
239        "manage_tasks" => ToolMetadata {
240            category: ToolCategory::Workflow,
241            mutates_workspace: false,
242            external_surface: false,
243            trust_sensitive: false,
244            read_only_friendly: true,
245            plan_scope: false,
246        },
247        _ => ToolMetadata {
248            category: ToolCategory::Other,
249            mutates_workspace: false,
250            external_surface: false,
251            trust_sensitive: false,
252            read_only_friendly: true,
253            plan_scope: false,
254        },
255    }
256}
257
258// ── Message types ─────────────────────────────────────────────────────────────
259
260/// OpenAI-compatible chat message. Content can be a string (legacy) or a
261/// Vec of ContentPart (multimodal).
262#[derive(Serialize, Deserialize, Clone, Debug)]
263pub struct ChatMessage {
264    pub role: String,
265    /// Support both simple string content and complex multi-part content (Vision).
266    pub content: MessageContent,
267    /// Assistant messages may have tool calls. Default to empty vec, not null.
268    #[serde(default, skip_serializing_if = "Vec::is_empty")]
269    pub tool_calls: Vec<ToolCallResponse>,
270    /// Tool message references the original call.
271    #[serde(skip_serializing_if = "Option::is_none")]
272    pub tool_call_id: Option<String>,
273    /// Tool message name.
274    #[serde(skip_serializing_if = "Option::is_none")]
275    pub name: Option<String>,
276}
277
278#[derive(Serialize, Deserialize, Clone, Debug)]
279#[serde(untagged)]
280pub enum MessageContent {
281    Text(String),
282    Parts(Vec<ContentPart>),
283}
284
285#[derive(Serialize, Deserialize, Clone, Debug)]
286#[serde(tag = "type")]
287pub enum ContentPart {
288    #[serde(rename = "text")]
289    Text { text: String },
290    #[serde(rename = "image_url")]
291    ImageUrl { image_url: ImageUrlSource },
292}
293
294#[derive(Serialize, Deserialize, Clone, Debug)]
295pub struct ImageUrlSource {
296    pub url: String,
297}
298
299impl Default for MessageContent {
300    fn default() -> Self {
301        MessageContent::Text(String::new())
302    }
303}
304
305impl MessageContent {
306    pub fn as_str(&self) -> &str {
307        match self {
308            MessageContent::Text(s) => s,
309            MessageContent::Parts(parts) => {
310                for part in parts {
311                    if let ContentPart::Text { text } = part {
312                        return text;
313                    }
314                }
315                ""
316            }
317        }
318    }
319}
320
321impl ChatMessage {
322    pub fn system(content: &str) -> Self {
323        Self {
324            role: "system".into(),
325            content: MessageContent::Text(content.into()),
326            tool_calls: Vec::new(),
327            tool_call_id: None,
328            name: None,
329        }
330    }
331    pub fn user(content: &str) -> Self {
332        Self {
333            role: "user".into(),
334            content: MessageContent::Text(content.into()),
335            tool_calls: Vec::new(),
336            tool_call_id: None,
337            name: None,
338        }
339    }
340    pub fn user_with_image(text: &str, image_url: &str) -> Self {
341        let mut text_parts = text.to_string();
342        if !text_parts.contains("<|image|>") {
343            text_parts.push_str(" <|image|>");
344        }
345        Self {
346            role: "user".into(),
347            content: MessageContent::Parts(vec![
348                ContentPart::Text { text: text_parts },
349                ContentPart::ImageUrl {
350                    image_url: ImageUrlSource {
351                        url: image_url.into(),
352                    },
353                },
354            ]),
355            tool_calls: Vec::new(),
356            tool_call_id: None,
357            name: None,
358        }
359    }
360    pub fn assistant_text(content: &str) -> Self {
361        Self {
362            role: "assistant".into(),
363            content: MessageContent::Text(content.into()),
364            tool_calls: Vec::new(),
365            tool_call_id: None,
366            name: None,
367        }
368    }
369    pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
370        Self {
371            role: "assistant".into(),
372            content: MessageContent::Text(content.into()),
373            tool_calls: calls,
374            tool_call_id: None,
375            name: None,
376        }
377    }
378    pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
379        Self::tool_result_for_model(tool_call_id, fn_name, content, "")
380    }
381
382    /// Build a tool result message, applying Gemma 4 native markup only when the
383    /// loaded model is actually a Gemma 4 model.
384    pub fn tool_result_for_model(
385        tool_call_id: &str,
386        fn_name: &str,
387        content: &str,
388        model: &str,
389    ) -> Self {
390        let body = if is_gemma4_model_name(model) {
391            format!(
392                "<|tool_response>response:{}{}{}<tool_response|>",
393                fn_name, "{", content
394            )
395        } else {
396            content.to_string()
397        };
398        Self {
399            role: "tool".into(),
400            content: MessageContent::Text(body),
401            tool_calls: Vec::new(),
402            tool_call_id: Some(tool_call_id.into()),
403            name: Some(fn_name.into()),
404        }
405    }
406}
407
408// ── Tool call as returned by the model ───────────────────────────────────────
409
410#[derive(Serialize, Deserialize, Clone, Debug)]
411pub struct ToolCallResponse {
412    pub id: String,
413    #[serde(rename = "type")]
414    pub call_type: String,
415    pub function: ToolCallFn,
416}
417
418#[derive(Serialize, Deserialize, Clone, Debug)]
419pub struct ToolCallFn {
420    pub name: String,
421    /// JSON-encoded arguments string (as returned by the API).
422    pub arguments: String,
423}
424
425// ── HTTP request / response shapes ───────────────────────────────────────────
426
427#[derive(Serialize)]
428struct ChatRequest {
429    model: String,
430    messages: Vec<ChatMessage>,
431    temperature: f32,
432    stream: bool,
433    #[serde(skip_serializing_if = "Option::is_none")]
434    tools: Option<Vec<ToolDefinition>>,
435}
436
437#[derive(Deserialize, Debug)]
438struct ChatResponse {
439    choices: Vec<ResponseChoice>,
440    usage: Option<TokenUsage>,
441}
442
443#[derive(Deserialize, Debug, Clone)]
444pub struct TokenUsage {
445    pub prompt_tokens: usize,
446    pub completion_tokens: usize,
447    pub total_tokens: usize,
448    #[serde(default)]
449    pub prompt_cache_hit_tokens: usize,
450    #[serde(default)]
451    pub cache_read_input_tokens: usize,
452}
453
454#[derive(Deserialize, Debug)]
455struct ResponseChoice {
456    message: ResponseMessage,
457    #[serde(default)]
458    finish_reason: Option<String>,
459}
460
461#[derive(Deserialize, Debug)]
462struct ResponseMessage {
463    content: Option<String>,
464    tool_calls: Option<Vec<ToolCallResponse>>,
465    /// LM Studio routes Qwen3 thinking-mode output here instead of wrapping
466    /// it in <think> tags inside `content`. When tool calls are generated
467    /// inside a think block, they end up here rather than in `tool_calls`.
468    #[serde(default)]
469    reasoning_content: Option<String>,
470}
471
// Bounds, in tokens, for the output reservation.
// NOTE(review): the code that applies these bounds is outside this section — confirm usage.
/// Smallest number of tokens reserved for model output.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1_024;
/// Largest number of tokens reserved for model output.
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4_096;
474
/// A context window counts as "tiny" when it holds at most 8192 tokens.
fn is_tiny_context_window(context_length: usize) -> bool {
    const TINY_CEILING: usize = 8_192;
    context_length <= TINY_CEILING
}
478
/// A context window counts as "compact" when it is larger than 8192 tokens
/// but no larger than 49152 tokens.
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
482
483pub fn is_compact_context_window_pub(context_length: usize) -> bool {
484    is_compact_context_window(context_length)
485}
486
/// Recognises provider error text that reports a context-limit overflow.
///
/// `lower` is matched as given, so callers pass already-lowercased text. It
/// matches when the text mentions both `n_keep` and `n_ctx`, or contains any
/// one of a fixed set of context-limit phrases.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const LIMIT_PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    let mentions_keep_and_ctx = lower.contains("n_keep") && lower.contains("n_ctx");
    mentions_keep_and_ctx || LIMIT_PHRASES.iter().any(|phrase| lower.contains(*phrase))
}
494
495fn classify_runtime_failure_tag(detail: &str) -> &'static str {
496    let lower = detail.to_ascii_lowercase();
497    if lower.contains("context_window_blocked")
498        || lower.contains("context ceiling reached")
499        || lower.contains("exceeds the")
500        || is_provider_context_limit_detail(&lower)
501    {
502        "context_window"
503    } else if lower.contains("empty response from model")
504        || lower.contains("model returned an empty response")
505    {
506        "empty_model_response"
507    } else if lower.contains("action blocked:")
508        || lower.contains("access denied")
509        || lower.contains("declined by user")
510    {
511        "tool_policy_blocked"
512    } else {
513        "provider_degraded"
514    }
515}
516
/// Returns the operator guidance sentence for a failure tag.
///
/// Unknown tags (including `provider_degraded`) receive the generic
/// retry-then-restart guidance.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const CONTEXT_WINDOW: &str = "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    const EMPTY_RESPONSE: &str = "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    const POLICY_BLOCKED: &str =
        "Stay inside the allowed workflow or switch modes before retrying.";
    const GENERIC: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";

    match tag {
        "context_window" => CONTEXT_WINDOW,
        "empty_model_response" => EMPTY_RESPONSE,
        "tool_policy_blocked" => POLICY_BLOCKED,
        _ => GENERIC,
    }
}
531
532fn format_runtime_failure_message(detail: &str) -> String {
533    let tag = classify_runtime_failure_tag(detail);
534    format!(
535        "[failure:{}] {} Detail: {}",
536        tag,
537        runtime_failure_guidance(tag),
538        detail.trim()
539    )
540}
541
/// Compact provider state reported to the operator surface via
/// `InferenceEvent::ProviderStatus`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProviderRuntimeState {
    Booting,
    Live,
    Recovering,
    /// Used for every failure tag without a dedicated state.
    Degraded,
    /// Used for the `context_window` failure tag.
    ContextWindow,
    /// Used for the `empty_model_response` failure tag.
    EmptyResponse,
}
551
/// Compact MCP server health reported via `InferenceEvent::McpStatus`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum McpRuntimeState {
    Unconfigured,
    Healthy,
    Degraded,
    Failed,
}
559
/// Typed checkpoint/blocker state reported via
/// `InferenceEvent::OperatorCheckpoint`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OperatorCheckpointState {
    Idle,
    RecoveringProvider,
    BudgetReduced,
    HistoryCompacted,
    BlockedContextWindow,
    BlockedPolicy,
    BlockedRecentFileEvidence,
    BlockedExactLineWindow,
    BlockedToolLoop,
    BlockedVerification,
}

impl OperatorCheckpointState {
    /// Stable `snake_case` label for the state, e.g. `blocked_tool_loop`.
    pub fn label(self) -> &'static str {
        match self {
            Self::Idle => "idle",
            Self::RecoveringProvider => "recovering_provider",
            Self::BudgetReduced => "budget_reduced",
            Self::HistoryCompacted => "history_compacted",
            Self::BlockedContextWindow => "blocked_context_window",
            Self::BlockedPolicy => "blocked_policy",
            Self::BlockedRecentFileEvidence => "blocked_recent_file_evidence",
            Self::BlockedExactLineWindow => "blocked_exact_line_window",
            Self::BlockedToolLoop => "blocked_tool_loop",
            Self::BlockedVerification => "blocked_verification",
        }
    }
}
590
591fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
592    match tag {
593        "context_window" => ProviderRuntimeState::ContextWindow,
594        "empty_model_response" => ProviderRuntimeState::EmptyResponse,
595        _ => ProviderRuntimeState::Degraded,
596    }
597}
598
/// Produces a one-line summary of a runtime failure for compact status
/// surfaces.
///
/// The three known tags (`context_window`, `empty_model_response`,
/// `tool_policy_blocked`) yield fixed sentences and ignore `detail`. Any other
/// tag yields `LM Studio degraded: <excerpt>`, where the excerpt is the first
/// 12 whitespace-separated words of `detail`, capped at 110 bytes and suffixed
/// with `...` when cut. An empty excerpt falls back to a fixed sentence.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    const MAX_EXCERPT_WORDS: usize = 12;
    const MAX_EXCERPT_BYTES: usize = 110;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(MAX_EXCERPT_WORDS)
                .collect::<Vec<_>>()
                .join(" ");
            if excerpt.len() > MAX_EXCERPT_BYTES {
                // `String::truncate` panics when the cut lands inside a
                // multi-byte character, so back up to the nearest boundary.
                // Index 0 is always a boundary, so the loop terminates.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }
            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
632
633// ── Events pushed to the TUI ──────────────────────────────────────────────────
634
635#[derive(Debug)]
636pub enum InferenceEvent {
637    /// A text token to append to the current assistant message.
638    Token(String),
639    /// A text token to be displayed on screen but NOT spoken (e.g. startup greeting).
640    MutedToken(String),
641    /// Internal model reasoning (shown in side panel, not dialogue).
642    Thought(String),
643    /// Critical diagnostic feedback from the voice synthesis engine.
644    VoiceStatus(String),
645    /// A tool call is starting – show a status line in the TUI.
646    ToolCallStart {
647        id: String,
648        name: String,
649        args: String,
650    },
651    /// A tool call completed – show result in the TUI.
652    ToolCallResult {
653        id: String,
654        name: String,
655        output: String,
656        is_error: bool,
657    },
658    /// A risky tool requires explicit user approval.
659    /// The TUI must send `true` (approved) or `false` (rejected) via `responder`.
660    /// When `diff` is Some, the modal renders a coloured before/after diff preview.
661    ApprovalRequired {
662        id: String,
663        name: String,
664        display: String,
665        /// Pre-formatted diff: lines starting with "- " are removals, "+ " are additions,
666        /// "---" is a file header.  None means a plain high-risk approval (no diff).
667        diff: Option<String>,
668        responder: tokio::sync::oneshot::Sender<bool>,
669    },
670    /// The current agent turn is complete.
671    Done,
672    /// An error occurred during inference.
673    Error(String),
674    /// Compact provider/runtime state for the operator surface.
675    ProviderStatus {
676        state: ProviderRuntimeState,
677        summary: String,
678    },
679    /// Typed operator checkpoint/blocker state for SPECULAR and recovery UIs.
680    OperatorCheckpoint {
681        state: OperatorCheckpointState,
682        summary: String,
683    },
684    /// Typed recovery recipe summary for operator/debug surfaces.
685    RecoveryRecipe { summary: String },
686    /// Compact MCP/runtime server health for the operator surface.
687    McpStatus {
688        state: McpRuntimeState,
689        summary: String,
690    },
691    /// Current compaction pressure against the adaptive threshold.
692    CompactionPressure {
693        estimated_tokens: usize,
694        threshold_tokens: usize,
695        percent: u8,
696    },
697    /// Current total prompt-budget pressure against the live context window.
698    PromptPressure {
699        estimated_input_tokens: usize,
700        reserved_output_tokens: usize,
701        estimated_total_tokens: usize,
702        context_length: usize,
703        percent: u8,
704    },
705    /// A generic task progress update (e.g. for single-agent tool execution).
706    TaskProgress {
707        id: String,
708        label: String,
709        progress: u8,
710    },
711    /// Real-time token usage update from the API.
712    UsageUpdate(TokenUsage),
713    /// The current runtime profile detected from LM Studio.
714    RuntimeProfile {
715        model_id: String,
716        context_length: usize,
717    },
718    /// Vein index status after each incremental re-index.
719    VeinStatus {
720        file_count: usize,
721        embedded_count: usize,
722        docs_only: bool,
723    },
724    /// File paths the Vein surfaced as relevant to the current turn.
725    /// Used to populate ACTIVE CONTEXT with retrieval results.
726    VeinContext { paths: Vec<String> },
727    /// A new companion was hatched mid-session via /reroll.
728    SoulReroll {
729        species: String,
730        rarity: String,
731        shiny: bool,
732        personality: String,
733    },
734    /// Embed model loaded/unloaded mid-session.
735    EmbedProfile { model_id: Option<String> },
736    /// A single line of live shell output, streamed while the command runs.
737    /// Displayed in the SPECULAR panel so the operator sees progress without
738    /// waiting for the full command to finish.
739    ShellLine(String),
740}
741
742// ── Engine implementation ─────────────────────────────────────────────────────
743
744impl InferenceEngine {
745    pub fn new(
746        api_url: String,
747        species: String,
748        snark: u8,
749    ) -> Result<Self, Box<dyn std::error::Error>> {
750        let client = reqwest::Client::builder()
751            .timeout(std::time::Duration::from_secs(180))
752            .build()?;
753
754        // Extract http://host:port as the base for all non-completions endpoints.
755        let base_url = {
756            let trimmed = api_url.trim_end_matches('/');
757            if let Some(scheme_end) = trimmed.find("://") {
758                let after_scheme = &trimmed[scheme_end + 3..];
759                if let Some(path_start) = after_scheme.find('/') {
760                    format!(
761                        "{}://{}",
762                        &trimmed[..scheme_end],
763                        &after_scheme[..path_start]
764                    )
765                } else {
766                    trimmed.to_string()
767                }
768            } else {
769                trimmed.to_string()
770            }
771        };
772
773        let api_url = if api_url.ends_with("/chat/completions") {
774            api_url
775        } else if api_url.ends_with("/") {
776            format!("{}chat/completions", api_url)
777        } else {
778            format!("{}/chat/completions", api_url)
779        };
780
781        Ok(Self {
782            client,
783            api_url,
784            base_url,
785            species,
786            snark,
787            kv_semaphore: Semaphore::new(3),
788            model: std::sync::RwLock::new(String::new()),
789            context_length: std::sync::atomic::AtomicUsize::new(32_768), // Gemma-4 Sweet Spot (32K)
790            economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
791            worker_model: None,
792            gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
793            cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
794        })
795    }
796
797    pub fn set_gemma_native_formatting(&self, enabled: bool) {
798        self.gemma_native_formatting
799            .store(enabled, std::sync::atomic::Ordering::SeqCst);
800    }
801
802    pub fn gemma_native_formatting_enabled(&self) -> bool {
803        self.gemma_native_formatting
804            .load(std::sync::atomic::Ordering::SeqCst)
805    }
806
807    pub fn current_model(&self) -> String {
808        self.model.read().map(|g| g.clone()).unwrap_or_default()
809    }
810
811    pub fn current_context_length(&self) -> usize {
812        self.context_length
813            .load(std::sync::atomic::Ordering::SeqCst)
814    }
815
816    pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
817        if let Ok(mut guard) = self.model.write() {
818            *guard = model.to_string();
819        }
820        self.context_length
821            .store(context_length, std::sync::atomic::Ordering::SeqCst);
822    }
823
824    /// Returns true if LM Studio is reachable.
825    pub async fn health_check(&self) -> bool {
826        let url = format!("{}/v1/models", self.base_url);
827        match self.client.get(&url).send().await {
828            Ok(resp) => resp.status().is_success(),
829            Err(_) => false,
830        }
831    }
832
833    /// Query /api/v0/models and return the first loaded chat model id.
834    /// Uses /api/v0/models (not /v1/models) because the OpenAI-compat endpoint
835    /// omits the `type` field, making it impossible to distinguish embedding
836    /// models from chat models. Falls back to /v1/models with a name heuristic
837    /// if /api/v0/models is unavailable.
838    /// Returns Some("") when LM Studio is reachable but no chat model is loaded
839    /// so callers can distinguish "offline" (None) from "no chat model" (Some("")).
840    pub async fn get_loaded_model(&self) -> Option<String> {
841        #[derive(Deserialize)]
842        struct ModelList {
843            data: Vec<ModelEntry>,
844        }
845        #[derive(Deserialize)]
846        struct ModelEntry {
847            id: String,
848            #[serde(rename = "type", default)]
849            model_type: String,
850            #[serde(default)]
851            state: String,
852        }
853
854        // Try /api/v0/models first — it has type and state fields.
855        if let Ok(resp) = self
856            .client
857            .get(format!("{}/api/v0/models", self.base_url))
858            .send()
859            .await
860        {
861            if let Ok(list) = resp.json::<ModelList>().await {
862                let chat_model = list
863                    .data
864                    .into_iter()
865                    .find(|m| m.model_type != "embeddings" && m.state == "loaded")
866                    .map(|m| m.id)
867                    .unwrap_or_default();
868                return Some(chat_model);
869            }
870        }
871
872        // Fallback: /v1/models lacks type info — use name heuristic to skip embed models.
873        let resp = self
874            .client
875            .get(format!("{}/v1/models", self.base_url))
876            .send()
877            .await
878            .ok()?;
879        let list: ModelList = resp.json().await.ok()?;
880        Some(
881            list.data
882                .into_iter()
883                .find(|m| !m.id.to_lowercase().contains("embed"))
884                .map(|m| m.id)
885                .unwrap_or_default(),
886        )
887    }
888
889    /// Returns the ID of the first loaded embedding model, if any.
890    /// Uses /api/v0/models which includes `type` and `state` fields.
891    /// The OpenAI-compat /v1/models endpoint omits `type` so cannot be used here.
892    /// Accepts any non-empty state (not just "loaded") to handle LM Studio variants
893    /// where the embed model may report a different state string at startup.
894    pub async fn get_embedding_model(&self) -> Option<String> {
895        #[derive(Deserialize)]
896        struct ModelList {
897            data: Vec<ModelEntry>,
898        }
899        #[derive(Deserialize)]
900        struct ModelEntry {
901            id: String,
902            #[serde(rename = "type", default)]
903            model_type: String,
904            #[serde(default)]
905            state: String,
906        }
907        let resp = self
908            .client
909            .get(format!("{}/api/v0/models", self.base_url))
910            .send()
911            .await
912            .ok()?;
913        let list: ModelList = resp.json().await.ok()?;
914        list.data
915            .into_iter()
916            .find(|m| m.model_type == "embeddings" && m.state == "loaded")
917            .map(|m| m.id)
918    }
919
920    /// Detect the loaded model's context window size.
921    /// Tries LM Studio's `/api/v0/models` endpoint first and prefers the loaded
922    /// model's live `loaded_context_length`, then falls back to older
923    /// `context_length` / `max_context_length` style fields.
924    /// Falls back to a heuristic from the model name, then 32K.
925    pub async fn detect_context_length(&self) -> usize {
926        #[derive(Deserialize)]
927        struct LmStudioModel {
928            id: Option<String>,
929            #[serde(rename = "type", default)]
930            model_type: String,
931            state: Option<String>,
932            loaded_context_length: Option<u64>,
933            context_length: Option<u64>,
934            max_context_length: Option<u64>,
935        }
936        #[derive(Deserialize)]
937        struct LmStudioList {
938            data: Vec<LmStudioModel>,
939        }
940
941        // Check api/v0/models (LM Studio specific)
942        if let Ok(resp) = self
943            .client
944            .get(format!("{}/api/v0/models", self.base_url))
945            .send()
946            .await
947        {
948            if let Ok(list) = resp.json::<LmStudioList>().await {
949                let target_model = self.current_model().to_ascii_lowercase();
950                // Never select embedding models for context-length detection.
951                let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
952                let loaded = list
953                    .data
954                    .iter()
955                    .find(|m| {
956                        non_embed(m)
957                            && m.state.as_deref() == Some("loaded")
958                            && m.id
959                                .as_deref()
960                                .map(|id| id.eq_ignore_ascii_case(&target_model))
961                                .unwrap_or(false)
962                    })
963                    .or_else(|| {
964                        list.data
965                            .iter()
966                            .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
967                    })
968                    .or_else(|| {
969                        list.data.iter().find(|m| {
970                            non_embed(m)
971                                && m.id
972                                    .as_deref()
973                                    .map(|id| id.eq_ignore_ascii_case(&target_model))
974                                    .unwrap_or(false)
975                        })
976                    })
977                    .or_else(|| list.data.iter().find(|m| non_embed(m)));
978
979                if let Some(model) = loaded {
980                    if let Some(ctx) = model.loaded_context_length {
981                        if ctx > 0 {
982                            return ctx as usize;
983                        }
984                    }
985                    if let Some(ctx) = model.context_length {
986                        if ctx > 0 {
987                            return ctx as usize;
988                        }
989                    }
990                    if let Some(ctx) = model.max_context_length {
991                        if ctx > 0 && ctx <= 32_768 {
992                            return ctx as usize;
993                        }
994                    }
995                }
996            }
997        }
998
999        // Heuristic fallback:
1000        // If "gemma-4" is detected, we target 32,768 as the baseline standard,
1001        // acknowledging that 131,072 is available for High-Capacity tasks.
1002        if self.current_model().to_lowercase().contains("gemma-4") {
1003            return 32_768;
1004        }
1005
1006        32_768
1007    }
1008
1009    pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
1010        let previous_model = self.current_model();
1011        let previous_context = self.current_context_length();
1012
1013        let detected_model = match self.get_loaded_model().await {
1014            Some(m) if !m.is_empty() => m,            // coding model found
1015            Some(_) => "no model loaded".to_string(), // reachable but no coding model
1016            None => previous_model.clone(),           // LM Studio offline
1017        };
1018
1019        if !detected_model.is_empty() && detected_model != previous_model {
1020            if let Ok(mut guard) = self.model.write() {
1021                *guard = detected_model.clone();
1022            }
1023        }
1024
1025        let detected_context = self.detect_context_length().await;
1026        let effective_model = if detected_model.is_empty() {
1027            previous_model.clone()
1028        } else {
1029            detected_model
1030        };
1031
1032        let changed = effective_model != previous_model || detected_context != previous_context;
1033        self.set_runtime_profile(&effective_model, detected_context);
1034
1035        Some((effective_model, detected_context, changed))
1036    }
1037
1038    pub fn build_system_prompt(
1039        &self,
1040        snark: u8,
1041        chaos: u8,
1042        brief: bool,
1043        professional: bool,
1044        tools: &[ToolDefinition],
1045        reasoning_history: Option<&str>,
1046        mcp_tools: &[crate::agent::mcp::McpTool],
1047    ) -> String {
1048        let mut sys = self.build_system_prompt_legacy(
1049            snark,
1050            chaos,
1051            brief,
1052            professional,
1053            tools,
1054            reasoning_history,
1055        );
1056
1057        if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1058            sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1059            sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1060            for tool in mcp_tools {
1061                let description = tool
1062                    .description
1063                    .as_deref()
1064                    .unwrap_or("No description provided.");
1065                sys.push_str(&format!("- {}: {}\n", tool.name, description));
1066            }
1067        }
1068
1069        sys
1070    }
1071
1072    pub fn build_system_prompt_legacy(
1073        &self,
1074        snark: u8,
1075        _chaos: u8,
1076        brief: bool,
1077        professional: bool,
1078        tools: &[ToolDefinition],
1079        reasoning_history: Option<&str>,
1080    ) -> String {
1081        let current_context_length = self.current_context_length();
1082        if is_tiny_context_window(current_context_length) {
1083            return self.build_system_prompt_tiny(brief, professional);
1084        }
1085        if is_compact_context_window(current_context_length) {
1086            return self.build_system_prompt_compact(brief, professional, tools);
1087        }
1088
1089        // Hematite bootstrap: keep reasoning disciplined without leaking scaffolding into user-facing replies.
1090        let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
1091                                     - You are Hematite, a local coding system working on the user's machine.\n\
1092                                     - The running Hematite build is ");
1093        sys.push_str(&crate::hematite_version_display());
1094        sys.push_str(".\n\
1095                                     - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
1096                                     - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
1097                                     - For simple questions, answer briefly in plain language.\n\
1098                                     - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
1099                                     - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
1100                                     - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
1101                                     - Keep internal reasoning inside channel delimiters.\n\
1102                                     - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
1103                                     <turn|>\n\n");
1104
1105        if let Some(history) = reasoning_history {
1106            if !history.is_empty() {
1107                sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
1108                sys.push_str(history);
1109                sys.push_str("\n\n");
1110            }
1111        }
1112
1113        // ADAPTIVE THOUGHT EFFICIENCY (Gemma-4 Native)
1114        if brief {
1115            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
1116                          - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
1117                          - Depth: Surface-level verification only.\n\n");
1118        } else {
1119            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
1120                          - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
1121                          - Depth: Full multi-step derivation required.\n\n");
1122        }
1123
1124        // IDENTITY & ENVIRONMENT
1125        let os = std::env::consts::OS;
1126        if professional {
1127            sys.push_str(&format!(
1128                "You are Hematite, a local coding system running on {}. \
1129                 The TUI is one interface layer, not your whole identity. \
1130                 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
1131                 Skip filler and keep the focus on the work.\n",
1132                os
1133            ));
1134        } else {
1135            sys.push_str(&format!(
1136                "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
1137                 The terminal UI is only one surface of the system. \
1138                 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
1139                 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
1140                self.species, snark, os
1141            ));
1142        }
1143
1144        // Inject loaded model and context window so the model knows its own budget.
1145        let current_model = self.current_model();
1146        if !current_model.is_empty() {
1147            sys.push_str(&format!(
1148                "Loaded model: {} | Context window: {} tokens. \
1149                 Calibrate response length and tool-call depth to fit within this budget.\n\n",
1150                current_model, current_context_length
1151            ));
1152            if is_gemma4_model_name(&current_model) {
1153                sys.push_str(
1154                    "Gemma 4 native note: prefer exact tool JSON with no extra prose when calling tools. \
1155                     Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
1156                     For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
1157                );
1158            }
1159        } else {
1160            sys.push_str(&format!(
1161                "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
1162                current_context_length
1163            ));
1164        }
1165
1166        // PROTOCOL & TOOLS
1167        let shell_desc = if cfg!(target_os = "windows") {
1168            "[EXTERNAL SHELL]: `powershell` (Windows).\n\
1169             - Use ONLY for builds, tests, or file migrations. \n\
1170             - You MUST use the `powershell` tool directly. \n\
1171             - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
1172        } else {
1173            "[EXTERNAL SHELL]: `bash` (Unix).\n\
1174             - Use ONLY for builds, tests, or file migrations. \n\
1175             - NEVER wrap bash in other shells. \n\n"
1176        };
1177
1178        sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
1179                      [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
1180                      - These are the ONLY way to explore and modify code. \n\
1181                      - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
1182        sys.push_str(shell_desc);
1183
1184        // ANTI-LOOPING & SELF-AUDIT
1185        sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
1186                      SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
1187
1188        if brief {
1189            sys.push_str(
1190                "BRIEF MODE: Respond in exactly ONE concise sentence unless providing code.\n\n",
1191            );
1192        }
1193
1194        sys.push_str("## CORE DIRECTIVES\n\
1195                       1. REASONING: Your internal reasoning goes in <think>...</think> blocks. Do NOT output reasoning as plain text.\n\
1196                       2. CONCISENESS: After <think>, output ONE concise technical sentence or code block. Nothing else.\n\
1197                       3. GROUNDEDNESS: Never invent tools, channels, or files. If a detail is not verified from tool output, say `uncertain`. Answer from stable Hematite capabilities unless repo implementation is requested.\n\n\
1198                       ## ARCHITECTURAL DISCIPLINE\n\
1199                       - HOST INSPECTION PRIORITY: MANDATORY. For all diagnostic questions (load, CPU/RAM, processes, toolchains, network, ports, OS config, log-checks), prefer `inspect_host` over raw `shell`. If `env_doctor` answers, do not follow with `path` unless requested.\n\
1200                       - WORKFLOW PRIORITY: Prefer `run_workspace_workflow` for project builds/tests and `run_hematite_maintainer_workflow` for app-level scripts over raw `shell`.\n\
1201                       - PROOF BEFORE ACTION: `read_file` or `inspect_lines` before editing. For files >200 lines, `grep_files` for a pattern BEFORE reading.\n\
1202                       - PROOF BEFORE COMMIT: Do not `git_commit` until a successful `verify_build` exists for the latest changes.\n\
1203                       - BUILT-IN FIRST: Prefer internal file tools over `mcp__filesystem__*` unless MCP is explicitly required.\n\n\
1204                       ## TECHNIQUE & SAFETY\n\
1205                       - SHELL DISCIPLINE: Risky `shell` calls REQUIRE a `reason` argument. Always use `powershell` on Windows; never `bash` or `/dev/null`.\n\
1206                       - EDIT & TOOL PRECISION: Use unique lines/anchors for `edit_file`. Do not call tools like 'think' or 'reasoning'. Paths and symbols are NOT tool names.\n\
1207                       - CONTEXT AWARENESS: Answer at the harness level (file ops, shell, build). Prefer real language examples (Python, C#, TS, Go). Never mention `mcp__*` tools unless active and relevant.\n\
1208                       - TOOLING DISCIPLINE: Prefer `describe_toolchain` over improvising tool surface from memory; preserve its identifiers exactly.");
1209
1210        // Scaffolding protocol — enforces build validation after project creation.
1211        sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
1212            2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
1213            3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
1214            4. Fix all errors before declaring success.\n\n\
1215            ## PRE-FLIGHT SCOPING PROTOCOL\n\
1216            Before attempting any multi-file task or complex refactor:\n\
1217            1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
1218            2. Use `auto_pin_context` to keep those files in active context.\n\
1219            3. Only then proceed to deeper edits or research.\n\n\
1220            ## REFACTORING PROTOCOL\n\
1221            When modifying existing code or renaming symbols:\n\
1222            1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
1223            2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
1224            3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
1225
1226        // Inject CLAUDE.md / instruction files from the project directory.
1227        sys.push_str(&load_instruction_files());
1228
1229        // Inject cross-session memories synthesized by DeepReflect.
1230        sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
1231
1232        // Native Gemma-4 Tool Declarations
1233        if !tools.is_empty() {
1234            sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
1235            for tool in tools {
1236                let schema = serde_json::to_string(&tool.function.parameters)
1237                    .unwrap_or_else(|_| "{}".to_string());
1238                sys.push_str(&format!(
1239                    "<|tool>declaration:{}{}{}<tool|>\n",
1240                    tool.function.name, "{", schema
1241                ));
1242                sys.push_str(&format!("// {})\n", tool.function.description));
1243            }
1244        }
1245
1246        sys
1247    }
1248
1249    fn build_system_prompt_compact(
1250        &self,
1251        brief: bool,
1252        professional: bool,
1253        tools: &[ToolDefinition],
1254    ) -> String {
1255        // Compact tier: fits in 16k context. Keeps tool names + one-line descriptions
1256        // but skips full JSON schemas, verbose protocol sections, and CLAUDE.md injection.
1257        let current_model = self.current_model();
1258        let current_context_length = self.current_context_length();
1259        let os = std::env::consts::OS;
1260
1261        let mut sys = String::from("<|turn>system\n<|think|>\n");
1262        sys.push_str(&format!(
1263            "You are Hematite {}, a local coding harness working on the user's machine.\n",
1264            crate::hematite_version_display()
1265        ));
1266        if professional {
1267            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1268        } else {
1269            sys.push_str(&format!(
1270                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1271                self.species
1272            ));
1273        }
1274        sys.push_str(&format!(
1275            "Model: {} | Context: {} tokens. Keep turns focused.\n",
1276            current_model, current_context_length
1277        ));
1278        if is_gemma4_model_name(&current_model) {
1279            sys.push_str(
1280                "Gemma 4: use exact tool JSON. No extra prose in tool calls. \
1281                 Raw regex patterns in grep_files, no slash delimiters.\n",
1282            );
1283        }
1284        if cfg!(target_os = "windows") {
1285            sys.push_str(&format!(
1286                "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1287                os
1288            ));
1289        } else {
1290            sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1291        }
1292        if brief {
1293            sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1294        }
1295
1296        sys.push_str(
1297            "\nCORE RULES:\n\
1298             - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1299             - Verify after edits: run `verify_build` after code changes, before committing.\n\
1300             - One tool at a time. Do not batch unrelated tool calls.\n\
1301             - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1302             - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1303             - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1304        );
1305
1306        if !tools.is_empty() {
1307            sys.push_str("\n# AVAILABLE TOOLS\n");
1308            for tool in tools {
1309                let desc: String = tool.function.description.chars().take(120).collect();
1310                sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1311            }
1312        }
1313
1314        sys.push_str("<turn|>\n");
1315        sys
1316    }
1317
1318    fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1319        let current_model = self.current_model();
1320        let current_context_length = self.current_context_length();
1321        let os = std::env::consts::OS;
1322        let mut sys = format!(
1323            "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1324            crate::hematite_version_display()
1325        );
1326        if professional {
1327            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1328        } else {
1329            sys.push_str(&format!(
1330                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1331                self.species
1332            ));
1333        }
1334        if !current_model.is_empty() {
1335            sys.push_str(&format!(
1336                "Loaded model: {} | Context window: {} tokens.\n",
1337                current_model, current_context_length
1338            ));
1339        } else {
1340            sys.push_str(&format!(
1341                "Context window: {} tokens.\n",
1342                current_context_length
1343            ));
1344        }
1345        sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1346        sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1347        sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1348        if cfg!(target_os = "windows") {
1349            sys.push_str(&format!(
1350                "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1351                os
1352            ));
1353        } else {
1354            sys.push_str(&format!(
1355                "You are running on {}. Use the native Unix shell conventions.\n",
1356                os
1357            ));
1358        }
1359        if brief {
1360            sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1361        }
1362        if is_gemma4_model_name(&current_model) {
1363            sys.push_str(
1364                "Gemma 4 note: use exact tool JSON with no extra prose when calling tools.\n",
1365            );
1366        }
1367        sys.push_str("<turn|>\n");
1368        sys
1369    }
1370
1371    // ── Non-streaming call (used for agentic turns with tool support) ─────────
1372
1373    /// Send messages to the model. Returns (text_content, tool_calls).
1374    /// Exactly one of the two will be Some on a successful response.
1375    pub async fn call_with_tools(
1376        &self,
1377        messages: &[ChatMessage],
1378        tools: &[ToolDefinition],
1379        // Override the model ID for this call. None = use the live runtime model.
1380        model_override: Option<&str>,
1381    ) -> Result<
1382        (
1383            Option<String>,
1384            Option<Vec<ToolCallResponse>>,
1385            Option<TokenUsage>,
1386            Option<String>,
1387        ),
1388        String,
1389    > {
1390        let _permit = self
1391            .kv_semaphore
1392            .acquire()
1393            .await
1394            .map_err(|e| e.to_string())?;
1395
1396        let current_model = self.current_model();
1397        let model = model_override.unwrap_or(current_model.as_str()).to_string();
1398        let filtered_tools = if cfg!(target_os = "windows") {
1399            tools
1400                .iter()
1401                .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1402                .cloned()
1403                .collect::<Vec<_>>()
1404        } else {
1405            tools.to_vec()
1406        };
1407
1408        let request_messages = if should_use_gemma_native_formatting(self, &model) {
1409            prepare_gemma_native_messages(messages)
1410        } else {
1411            messages.to_vec()
1412        };
1413
1414        // In compact context windows, restrict tools to the core coding set.
1415        // Full schemas for 36+ tools add 10k+ tokens via the model's chat template (e.g. Gemma 4).
1416        // Sending a small core set keeps schemas available for structured tool-call dispatch
1417        // while staying within the 16k budget.
1418        const COMPACT_CORE_TOOLS: &[&str] = &[
1419            "read_file",
1420            "inspect_lines",
1421            "edit_file",
1422            "write_file",
1423            "grep_files",
1424            "list_files",
1425            "verify_build",
1426            "shell",
1427        ];
1428        let effective_tools = if is_compact_context_window(self.current_context_length()) {
1429            let core: Vec<_> = filtered_tools
1430                .iter()
1431                .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1432                .cloned()
1433                .collect();
1434            if core.is_empty() {
1435                None
1436            } else {
1437                Some(core)
1438            }
1439        } else if filtered_tools.is_empty() {
1440            None
1441        } else {
1442            Some(filtered_tools)
1443        };
1444
1445        let request = ChatRequest {
1446            model: model.clone(),
1447            messages: request_messages,
1448            temperature: 0.2,
1449            stream: false,
1450            tools: effective_tools,
1451        };
1452
1453        // Exponential backoff: retry up to 3× on 5xx / timeout / connect errors.
1454        preflight_chat_request(
1455            &model,
1456            &request.messages,
1457            request.tools.as_deref().unwrap_or(&[]),
1458            self.current_context_length(),
1459        )?;
1460
1461        let mut last_err = String::new();
1462        let mut response_opt: Option<reqwest::Response> = None;
1463        for attempt in 0..3u32 {
1464            match self.client.post(&self.api_url).json(&request).send().await {
1465                Ok(res) if res.status().is_success() => {
1466                    response_opt = Some(res);
1467                    break;
1468                }
1469                Ok(res) if res.status().as_u16() >= 500 => {
1470                    last_err = format!("LM Studio error {}", res.status());
1471                }
1472                Ok(res) => {
1473                    // 4xx — don't retry
1474                    let status = res.status();
1475                    let body = res.text().await.unwrap_or_default();
1476                    let preview = &body[..body.len().min(300)];
1477                    return Err(format!("LM Studio error {}: {}", status, preview));
1478                }
1479                Err(e) if e.is_timeout() || e.is_connect() => {
1480                    last_err = format!("Request failed: {}", e);
1481                }
1482                Err(e) => return Err(format!("Request failed: {}", e)),
1483            }
1484            if attempt < 2 {
1485                let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1486                tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1487            }
1488        }
1489        let res = response_opt
1490            .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1491
1492        let body: ChatResponse = res
1493            .json()
1494            .await
1495            .map_err(|e| format!("Response parse error: {}", e))?;
1496
1497        if let Some(usage) = &body.usage {
1498            let mut econ = self.economics.lock().unwrap();
1499            econ.input_tokens += usage.prompt_tokens;
1500            econ.output_tokens += usage.completion_tokens;
1501        }
1502
1503        let choice = body
1504            .choices
1505            .into_iter()
1506            .next()
1507            .ok_or_else(|| "Empty response from model".to_string())?;
1508
1509        let finish_reason = choice.finish_reason;
1510        let mut tool_calls = choice.message.tool_calls;
1511        let mut content = choice.message.content;
1512
1513        // Gemma-4 Fallback: If the model outputs native <|tool_call|> tags in the text content,
1514        // extract them and treat them as valid tool calls.
1515        if let Some(raw_content) = &content {
1516            let native_calls = extract_native_tool_calls(raw_content);
1517            if !native_calls.is_empty() {
1518                let mut existing = tool_calls.unwrap_or_default();
1519                existing.extend(native_calls);
1520                tool_calls = Some(existing);
1521                let stripped = strip_native_tool_call_text(raw_content);
1522                content = if stripped.trim().is_empty() {
1523                    None
1524                } else {
1525                    Some(stripped)
1526                };
1527            }
1528        }
1529
1530        if is_gemma4_model_name(&model) {
1531            if let Some(calls) = tool_calls.as_mut() {
1532                for call in calls.iter_mut() {
1533                    call.function.arguments = normalize_tool_argument_string(
1534                        &call.function.name,
1535                        &call.function.arguments,
1536                    );
1537                }
1538            }
1539        }
1540
1541        // Qwen3 Fallback: When the model generates tool calls inside a <think> block,
1542        // LM Studio routes the entire thinking output (including <tool_call> XML) to
1543        // `reasoning_content` instead of `tool_calls`. If content is empty and we have
1544        // no tool calls yet, check reasoning_content for embedded tool call markup.
1545        let reasoning_text = choice.message.reasoning_content.unwrap_or_default();
1546        if tool_calls.as_ref().map(|v| v.is_empty()).unwrap_or(true)
1547            && content
1548                .as_ref()
1549                .map(|s| s.trim().is_empty())
1550                .unwrap_or(true)
1551            && !reasoning_text.is_empty()
1552        {
1553            let recovered = extract_native_tool_calls(&reasoning_text);
1554            if !recovered.is_empty() {
1555                tool_calls = Some(recovered);
1556                // Clear content so downstream code doesn't see an empty string.
1557                content = None;
1558            }
1559        }
1560
1561        Ok((content, tool_calls, body.usage, finish_reason))
1562    }
1563
1564    // ── Streaming call (used for plain-text responses) ────────────────────────
1565
1566    /// Stream a conversation (no tools). Emits Token/Done/Error events.
1567    pub async fn stream_messages(
1568        &self,
1569        messages: &[ChatMessage],
1570        tx: mpsc::Sender<InferenceEvent>,
1571    ) -> Result<(), Box<dyn std::error::Error>> {
1572        let current_model = self.current_model();
1573        let request_messages = if should_use_gemma_native_formatting(self, &current_model) {
1574            prepare_gemma_native_messages(messages)
1575        } else {
1576            messages
1577                .iter()
1578                .map(|m| {
1579                    let mut clone = m.clone();
1580                    let current_text = m.content.as_str();
1581                    if !current_text.starts_with("<|turn>") {
1582                        clone.content = MessageContent::Text(format!(
1583                            "<|turn>{}\n{}\n<turn|>",
1584                            m.role, current_text
1585                        ));
1586                    }
1587                    clone
1588                })
1589                .collect()
1590        };
1591
1592        let request = ChatRequest {
1593            model: current_model.clone(),
1594            messages: request_messages,
1595            temperature: 0.7,
1596            stream: true,
1597            tools: None,
1598        };
1599
1600        if let Err(e) = preflight_chat_request(
1601            &current_model,
1602            &request.messages,
1603            &[],
1604            self.current_context_length(),
1605        ) {
1606            let tag = classify_runtime_failure_tag(&e);
1607            let _ = tx
1608                .send(InferenceEvent::ProviderStatus {
1609                    state: provider_state_for_failure_tag(tag),
1610                    summary: compact_runtime_failure_summary(tag, &e),
1611                })
1612                .await;
1613            let _ = tx
1614                .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1615                .await;
1616            let _ = tx.send(InferenceEvent::Done).await;
1617            return Ok(());
1618        }
1619
1620        let mut last_err = String::new();
1621        let mut response_opt: Option<reqwest::Response> = None;
1622        for attempt in 0..2u32 {
1623            match self.client.post(&self.api_url).json(&request).send().await {
1624                Ok(res) if res.status().is_success() => {
1625                    response_opt = Some(res);
1626                    break;
1627                }
1628                Ok(res) if res.status().as_u16() >= 500 => {
1629                    last_err = format!("LM Studio error {}", res.status());
1630                }
1631                Ok(res) => {
1632                    let status = res.status();
1633                    let body = res.text().await.unwrap_or_default();
1634                    let preview = &body[..body.len().min(300)];
1635                    let detail = format!("LM Studio error {}: {}", status, preview);
1636                    let tag = classify_runtime_failure_tag(&detail);
1637                    let _ = tx
1638                        .send(InferenceEvent::ProviderStatus {
1639                            state: provider_state_for_failure_tag(tag),
1640                            summary: compact_runtime_failure_summary(tag, &detail),
1641                        })
1642                        .await;
1643                    let _ = tx
1644                        .send(InferenceEvent::Error(format_runtime_failure_message(
1645                            &detail,
1646                        )))
1647                        .await;
1648                    let _ = tx.send(InferenceEvent::Done).await;
1649                    return Ok(());
1650                }
1651                Err(e) if e.is_timeout() || e.is_connect() => {
1652                    last_err = format!("Request failed: {}", e);
1653                }
1654                Err(e) => {
1655                    let detail = format!("Request failed: {}", e);
1656                    let tag = classify_runtime_failure_tag(&detail);
1657                    let _ = tx
1658                        .send(InferenceEvent::ProviderStatus {
1659                            state: provider_state_for_failure_tag(tag),
1660                            summary: compact_runtime_failure_summary(tag, &detail),
1661                        })
1662                        .await;
1663                    let _ = tx
1664                        .send(InferenceEvent::Error(format_runtime_failure_message(
1665                            &detail,
1666                        )))
1667                        .await;
1668                    let _ = tx.send(InferenceEvent::Done).await;
1669                    return Ok(());
1670                }
1671            }
1672            if attempt < 1 {
1673                let _ = tx
1674                    .send(InferenceEvent::ProviderStatus {
1675                        state: ProviderRuntimeState::Recovering,
1676                        summary: "LM Studio degraded during stream startup; retrying once.".into(),
1677                    })
1678                    .await;
1679                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1680            }
1681        }
1682        let Some(res) = response_opt else {
1683            let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1684            let tag = classify_runtime_failure_tag(&detail);
1685            let _ = tx
1686                .send(InferenceEvent::ProviderStatus {
1687                    state: provider_state_for_failure_tag(tag),
1688                    summary: compact_runtime_failure_summary(tag, &detail),
1689                })
1690                .await;
1691            let _ = tx
1692                .send(InferenceEvent::Error(format_runtime_failure_message(
1693                    &detail,
1694                )))
1695                .await;
1696            let _ = tx.send(InferenceEvent::Done).await;
1697            return Ok(());
1698        };
1699
1700        use futures::StreamExt;
1701        let mut byte_stream = res.bytes_stream();
1702
1703        // [Collaborative Strategy] TokenBuffer refactor suggested by Hematite local agent.
1704        // Aggregates tokens to ensure coherent linguistic chunks for UI/Voice.
1705        let mut line_buffer = String::new();
1706        let mut content_buffer = String::new();
1707        let mut past_think = false;
1708        let mut emitted_any_content = false;
1709        let mut emitted_live_status = false;
1710
1711        // Immediate cancel gate: break *before* awaiting the stream
1712        // so Escape works even when LM Studio is silent between chunks.
1713        loop {
1714            let next = tokio::select! {
1715                // Race: next SSE chunk vs cancel poll
1716                chunk = byte_stream.next() => chunk,
1717                _ = tokio::time::sleep(std::time::Duration::from_millis(50)) => {
1718                    if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1719                        break;
1720                    }
1721                    continue;
1722                }
1723            };
1724
1725            let Some(item) = next else { break };
1726
1727            let chunk = match item {
1728                Ok(chunk) => chunk,
1729                Err(e) => {
1730                    let detail = format!("Request failed: {}", e);
1731                    let tag = classify_runtime_failure_tag(&detail);
1732                    let _ = tx
1733                        .send(InferenceEvent::ProviderStatus {
1734                            state: provider_state_for_failure_tag(tag),
1735                            summary: compact_runtime_failure_summary(tag, &detail),
1736                        })
1737                        .await;
1738                    let _ = tx
1739                        .send(InferenceEvent::Error(format_runtime_failure_message(
1740                            &detail,
1741                        )))
1742                        .await;
1743                    let _ = tx.send(InferenceEvent::Done).await;
1744                    return Ok(());
1745                }
1746            };
1747            line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1748
1749            while let Some(pos) = line_buffer.find("\n\n") {
1750                let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1751                let data_pos = match event_str.find("data: ") {
1752                    Some(p) => p,
1753                    None => continue,
1754                };
1755
1756                let data = event_str[data_pos + 6..].trim();
1757                if data == "[DONE]" {
1758                    break;
1759                }
1760
1761                if let Ok(json) = serde_json::from_str::<Value>(data) {
1762                    let delta = &json["choices"][0]["delta"];
1763
1764                    // Process reasoning/thought deltas (Qwen/O1 style)
1765                    if let Some(reasoning) = delta["reasoning_content"]
1766                        .as_str()
1767                        .or_else(|| delta["thought"].as_str())
1768                    {
1769                        if !reasoning.is_empty() {
1770                            past_think = false; // We are in reasoning mode
1771                            content_buffer.push_str(reasoning);
1772                            if content_buffer.len() > 30
1773                                && (reasoning.contains('\n') || reasoning.contains('.'))
1774                            {
1775                                let _ = tx
1776                                    .send(InferenceEvent::Thought(content_buffer.clone()))
1777                                    .await;
1778                                emitted_any_content = true;
1779                                content_buffer.clear();
1780                            }
1781                        }
1782                    }
1783
1784                    // Process standard content deltas
1785                    if let Some(content) = delta["content"].as_str() {
1786                        if content.is_empty() {
1787                            continue;
1788                        }
1789
1790                        // Auto-transition: if we have content but were in 'thinking' mode,
1791                        // and haven't seen an explicit tag, assume reasoning is over once content is non-empty.
1792                        if !past_think && !content_buffer.is_empty() && !content.trim().is_empty() {
1793                            // Only transition if the content isn't logically part of the thinking
1794                            // block (some models mix them). Standard heuristic: first non-whitespace content.
1795                            let _ = tx
1796                                .send(InferenceEvent::Thought(content_buffer.clone()))
1797                                .await;
1798                            content_buffer.clear();
1799                            past_think = true;
1800                        }
1801
1802                        if !past_think {
1803                            let lc = content.to_lowercase();
1804                            let close = lc
1805                                .find("<channel|>")
1806                                .map(|i| (i, "<channel|>".len()))
1807                                .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1808
1809                            if let Some((tag_start, tag_len)) = close {
1810                                // Flush any existing thought buffer
1811                                let before = &content[..tag_start];
1812                                content_buffer.push_str(before);
1813                                if !content_buffer.trim().is_empty() {
1814                                    let _ = tx
1815                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1816                                        .await;
1817                                    emitted_any_content = true;
1818                                }
1819                                content_buffer.clear();
1820
1821                                past_think = true;
1822                                let after = content[tag_start + tag_len..].trim_start_matches('\n');
1823                                content_buffer.push_str(after);
1824                            } else {
1825                                // Still in reasoning block
1826                                content_buffer.push_str(content);
1827                                if content_buffer.len() > 30
1828                                    && (content.contains('\n') || content.contains('.'))
1829                                {
1830                                    let _ = tx
1831                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1832                                        .await;
1833                                    emitted_any_content = true;
1834                                    content_buffer.clear();
1835                                }
1836                            }
1837                        } else {
1838                            // PAST THINK: final answer tokens.
1839                            content_buffer.push_str(content);
1840                            let is_boundary = content.contains(' ')
1841                                || content.contains('.')
1842                                || content.contains('!')
1843                                || content.contains('?');
1844
1845                            if content_buffer.len() > 10 && is_boundary {
1846                                if !emitted_live_status {
1847                                    let _ = tx
1848                                        .send(InferenceEvent::ProviderStatus {
1849                                            state: ProviderRuntimeState::Live,
1850                                            summary: String::new(),
1851                                        })
1852                                        .await;
1853                                    emitted_live_status = true;
1854                                }
1855                                let _ =
1856                                    tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1857                                emitted_any_content = true;
1858                                content_buffer.clear();
1859                            }
1860                        }
1861                    }
1862                }
1863            }
1864        }
1865
1866        // Final Flush
1867        if !content_buffer.is_empty() {
1868            if past_think {
1869                if !emitted_live_status {
1870                    let _ = tx
1871                        .send(InferenceEvent::ProviderStatus {
1872                            state: ProviderRuntimeState::Live,
1873                            summary: String::new(),
1874                        })
1875                        .await;
1876                }
1877                let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1878            } else {
1879                let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1880            }
1881            emitted_any_content = true;
1882        }
1883
1884        if !emitted_any_content {
1885            let _ = tx
1886                .send(InferenceEvent::ProviderStatus {
1887                    state: ProviderRuntimeState::EmptyResponse,
1888                    summary: compact_runtime_failure_summary(
1889                        "empty_model_response",
1890                        "Empty response from model",
1891                    ),
1892                })
1893                .await;
1894            let _ = tx
1895                .send(InferenceEvent::Error(format_runtime_failure_message(
1896                    "Empty response from model",
1897                )))
1898                .await;
1899            let _ = tx.send(InferenceEvent::Done).await;
1900            return Ok(());
1901        }
1902
1903        let _ = tx.send(InferenceEvent::Done).await;
1904        Ok(())
1905    }
1906
1907    /// Single-turn streaming (legacy helper used by startup sequence).
1908    pub async fn stream_generation(
1909        &self,
1910        prompt: &str,
1911        snark: u8,
1912        chaos: u8,
1913        brief: bool,
1914        professional: bool,
1915        tx: mpsc::Sender<InferenceEvent>,
1916    ) -> Result<(), Box<dyn std::error::Error>> {
1917        let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1918        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1919        self.stream_messages(&messages, tx).await
1920    }
1921
1922    // ── Swarm worker helpers (non-streaming) ──────────────────────────────────
1923
1924    /// Runs a task using the `worker_model` if set, otherwise falls back to the main `model`.
1925    pub async fn generate_task_worker(
1926        &self,
1927        prompt: &str,
1928        professional: bool,
1929    ) -> Result<String, String> {
1930        let current_model = self.current_model();
1931        let model = self
1932            .worker_model
1933            .as_deref()
1934            .unwrap_or(current_model.as_str());
1935        self.generate_task_with_model(prompt, 0.1, professional, model)
1936            .await
1937    }
1938
1939    pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1940        self.generate_task_with_temp(prompt, 0.1, professional)
1941            .await
1942    }
1943
1944    pub async fn generate_task_with_temp(
1945        &self,
1946        prompt: &str,
1947        temp: f32,
1948        professional: bool,
1949    ) -> Result<String, String> {
1950        let current_model = self.current_model();
1951        self.generate_task_with_model(prompt, temp, professional, &current_model)
1952            .await
1953    }
1954
1955    pub async fn generate_task_with_model(
1956        &self,
1957        prompt: &str,
1958        temp: f32,
1959        professional: bool,
1960        model: &str,
1961    ) -> Result<String, String> {
1962        let _permit = self
1963            .kv_semaphore
1964            .acquire()
1965            .await
1966            .map_err(|e| e.to_string())?;
1967
1968        let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1969        let request_messages = if should_use_gemma_native_formatting(self, model) {
1970            prepare_gemma_native_messages(&[
1971                ChatMessage::system(&system),
1972                ChatMessage::user(prompt),
1973            ])
1974        } else {
1975            vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1976        };
1977        let request = ChatRequest {
1978            model: model.to_string(),
1979            messages: request_messages,
1980            temperature: temp,
1981            stream: false,
1982            tools: None,
1983        };
1984
1985        preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1986
1987        let res = self
1988            .client
1989            .post(&self.api_url)
1990            .json(&request)
1991            .send()
1992            .await
1993            .map_err(|e| format!("LM Studio request failed: {}", e))?;
1994
1995        let body: ChatResponse = res
1996            .json()
1997            .await
1998            .map_err(|e| format!("Failed to parse response: {}", e))?;
1999
2000        body.choices
2001            .first()
2002            .and_then(|c| c.message.content.clone())
2003            .ok_or_else(|| "Empty response from model".to_string())
2004    }
2005
2006    // ── History management ────────────────────────────────────────────────────
2007
2008    /// Prune middle turns when context grows too large, keeping system + recent N.
2009    #[allow(dead_code)]
2010    pub fn snip_history(
2011        &self,
2012        turns: &[ChatMessage],
2013        max_tokens_estimate: usize,
2014        keep_recent: usize,
2015    ) -> Vec<ChatMessage> {
2016        let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
2017        if total_chars / 4 <= max_tokens_estimate {
2018            return turns.to_vec();
2019        }
2020        let keep = keep_recent.min(turns.len());
2021        let mut snipped = vec![turns[0].clone()];
2022        if turns.len() > keep + 1 {
2023            snipped.push(ChatMessage::system(&format!(
2024                "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
2025                turns.len() - keep - 1
2026            )));
2027            snipped.extend_from_slice(&turns[turns.len() - keep..]);
2028        } else {
2029            snipped = turns.to_vec();
2030        }
2031        snipped
2032    }
2033}
2034
2035fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
2036    serde_json::to_vec(value)
2037        .ok()
2038        .map_or(0, |bytes| bytes.len() / 4 + 1)
2039}
2040
2041const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
2042
2043fn estimate_message_tokens(message: &ChatMessage) -> usize {
2044    let content_tokens = match &message.content {
2045        MessageContent::Text(s) => s.len() / 4 + 1,
2046        MessageContent::Parts(parts) => parts
2047            .iter()
2048            .map(|part| match part {
2049                ContentPart::Text { text } => text.len() / 4 + 1,
2050                // Image payloads are transported as data URLs, but their base64
2051                // length should not be treated like plain text context pressure.
2052                ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
2053            })
2054            .sum(),
2055    };
2056    let tool_tokens: usize = message
2057        .tool_calls
2058        .iter()
2059        .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
2060        .sum();
2061    content_tokens + tool_tokens + 6
2062}
2063
2064pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
2065    messages.iter().map(estimate_message_tokens).sum()
2066}
2067
2068fn reserved_output_tokens(context_length: usize) -> usize {
2069    let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2070    proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2071}
2072
2073pub fn estimate_prompt_pressure(
2074    messages: &[ChatMessage],
2075    tools: &[ToolDefinition],
2076    context_length: usize,
2077) -> (usize, usize, usize, u8) {
2078    let estimated_input_tokens =
2079        estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2080    let reserved_output = reserved_output_tokens(context_length);
2081    let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2082    let percent = if context_length == 0 {
2083        0
2084    } else {
2085        ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2086    };
2087    (
2088        estimated_input_tokens,
2089        reserved_output,
2090        estimated_total,
2091        percent,
2092    )
2093}
2094
2095fn preflight_chat_request(
2096    model: &str,
2097    messages: &[ChatMessage],
2098    tools: &[ToolDefinition],
2099    context_length: usize,
2100) -> Result<(), String> {
2101    let (estimated_input_tokens, reserved_output, estimated_total, _) =
2102        estimate_prompt_pressure(messages, tools, context_length);
2103
2104    if estimated_total > context_length {
2105        return Err(format!(
2106            "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2107            model, estimated_input_tokens, reserved_output, estimated_total, context_length
2108        ));
2109    }
2110
2111    Ok(())
2112}
2113
/// Collect project instruction files from the working directory and its ancestors.
///
/// Visits the current working directory and then up to three parent directories
/// (four levels in total). In each one it looks for `CLAUDE.md`,
/// `CLAUDE.local.md`, and `.hematite/instructions.md`.
///
/// * Missing, unreadable, or whitespace-only files are skipped.
/// * Files whose content hash was already seen are skipped.
/// * Each file is cut to at most 4,000 bytes (on a character boundary) and
///   marked with `...[truncated]`.
/// * A file that would push the running total past 12,000 bytes is not added,
///   and the remaining candidates in that directory are skipped.
///
/// Returns an empty string when the working directory cannot be determined or
/// nothing was collected; otherwise the collected sections under a
/// `# Project Instructions` heading.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::fmt::Write as _;
    use std::hash::{Hash, Hasher};

    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;
    const MAX_LEVELS: usize = 4;
    const CANDIDATES: [&str; 3] = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    let Ok(mut dir) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;

    for _ in 0..MAX_LEVELS {
        for name in &CANDIDATES {
            let path = dir.join(name);
            // A failed read covers the "file does not exist" case as well, so no
            // separate existence check is needed.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back up to a char boundary: slicing inside a multi-byte
                // character would panic.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            if total_chars + truncated.len() > MAX_TOTAL {
                break;
            }
            total_chars += truncated.len();
            // Writing into a String cannot fail.
            let _ = write!(result, "\n--- {} ---\n{}\n", path.display(), truncated);
        }
        // Step to the parent directory; stop at the filesystem root.
        if !dir.pop() {
            break;
        }
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2177
/// Extracts the reasoning text of the first Gemma-4 thought block in `text`.
///
/// The block opens with `<|channel>thought` and closes with `<channel|>`; both
/// tags are matched ASCII case-insensitively. If the closing tag is missing,
/// everything after the opening tag is treated as the block.
///
/// Returns the trimmed block content, or `None` when there is no opening tag or
/// the content is empty after trimming.
pub fn extract_think_block(text: &str) -> Option<String> {
    // Official Gemma-4 Native Tags
    const OPEN_TAG: &str = "<|channel>thought";
    const CLOSE_TAG: &str = "<channel|>";

    // ASCII-only folding keeps byte offsets identical between `lower` and `text`.
    // Full Unicode lowercasing can change byte lengths, which would make the
    // slices below cut in the wrong place or panic.
    let lower = text.to_ascii_lowercase();

    let content_start = lower.find(OPEN_TAG)? + OPEN_TAG.len();
    let close_pos = lower[content_start..]
        .find(CLOSE_TAG)
        .map_or(text.len(), |p| content_start + p);

    let content = text[content_start..close_pos].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
2200
2201pub fn strip_think_blocks(text: &str) -> String {
2202    // Fast-path: strip a stray </think> the model emits at the start when it skips
2203    // the opening tag (common with Qwen after tool calls). Strip it before the lower
2204    // allocation so it can't slip through any branch below.
2205    let text = {
2206        let t = text.trim_start();
2207        if t.to_lowercase().starts_with("</think>") {
2208            &t[8..]
2209        } else {
2210            text
2211        }
2212    };
2213
2214    let lower = text.to_lowercase();
2215
2216    // Use the official Gemma-4 closing tag — answer is everything after it.
2217    if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
2218        let answer = text[end..]
2219            .replace("<|channel>thought", "")
2220            .replace("<channel|>", "");
2221        return answer.trim().replace("\n\n\n", "\n\n").to_string();
2222    }
2223
2224    // No closing tag — if there's an unclosed opening tag, discard everything before and during it.
2225    let first_open = [
2226        lower.find("<|channel>thought"), // Prioritize Gemma-4 native
2227        lower.find("<think>"),
2228        lower.find("<thought>"),
2229        lower.find("<|think|>"),
2230    ]
2231    .iter()
2232    .filter_map(|&x| x)
2233    .min();
2234
2235    if let Some(start) = first_open {
2236        if start > 0 {
2237            return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
2238        }
2239        return String::new();
2240    }
2241
2242    // If the model outputs 'naked' reasoning without tags:
2243    // Strip leading sentences like "The user asked..." or "I should present..."
2244    // if they appear before actual answer content.
2245    let naked_reasoning_phrases: &[&str] = &[
2246        "the user asked",
2247        "the user is asking",
2248        "the user wants",
2249        "i will structure",
2250        "i should provide",
2251        "i should give",
2252        "i should avoid",
2253        "i should note",
2254        "i should focus",
2255        "i should keep",
2256        "i should respond",
2257        "i should present",
2258        "i should display",
2259        "i should show",
2260        "i need to",
2261        "i can see from",
2262        "without being overly",
2263        "let me ",
2264        "necessary information in my identity",
2265        "was computed successfully",
2266        "computed successfully",
2267    ];
2268    let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
2269    if is_naked_reasoning {
2270        let lines: Vec<&str> = text.lines().collect();
2271        if !lines.is_empty() {
2272            // Skip leading lines that are themselves reasoning prose or blank.
2273            // Stop skipping at the first line that looks like real answer content.
2274            let mut start_idx = 0;
2275            for (i, line) in lines.iter().enumerate() {
2276                let l = line.to_lowercase();
2277                let is_reasoning_line =
2278                    naked_reasoning_phrases.iter().any(|p| l.contains(p)) || l.trim().is_empty();
2279                if is_reasoning_line {
2280                    start_idx = i + 1;
2281                } else {
2282                    break;
2283                }
2284            }
2285            if start_idx < lines.len() {
2286                return lines[start_idx..]
2287                    .join("\n")
2288                    .trim()
2289                    .replace("\n\n\n", "\n\n")
2290                    .to_string();
2291            }
2292            // Entire response was reasoning prose — return empty.
2293            return String::new();
2294        }
2295    }
2296
2297    // Strip leaked XML tool-call fragments that Qwen sometimes emits when it
2298    // abandons a tool call mid-generation (e.g. </parameter></function></tool_call>).
2299    let cleaned = strip_xml_tool_call_artifacts(text);
2300    cleaned.trim().replace("\n\n\n", "\n\n").to_string()
2301}
2302
/// Remove stray XML tool-call closing/opening tags that local models occasionally
/// leak into visible output when they start-then-abandon a tool call.
///
/// Matching is ASCII case-insensitive and is done directly on the bytes of the
/// working buffer, so the offsets that get removed always refer to the same
/// string that was searched (searching a Unicode-lowercased copy can yield
/// offsets that do not line up with the original when lowercasing changes a
/// character's byte length).
///
/// Only the exact tag spellings listed below are removed; attribute-bearing
/// forms such as `<function=name>` are left untouched. Whitespace and blank
/// lines surrounding a removed tag are not modified by this function.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    // Tags to remove (both open and close forms, case-insensitive).
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        // Stray think/reasoning closing tags that leak after block extraction.
        "</think>",
        "</thought>",
        "</thinking>",
    ];

    /// Byte offset of the first ASCII-case-insensitive occurrence of `needle`
    /// in `haystack`, or `None` when absent (or when `needle` is empty).
    fn find_ascii_ci(haystack: &str, needle: &str) -> Option<usize> {
        let (hay, pat) = (haystack.as_bytes(), needle.as_bytes());
        if pat.is_empty() {
            return None;
        }
        hay.windows(pat.len())
            .position(|window| window.eq_ignore_ascii_case(pat))
    }

    let mut out = text.to_string();
    for tag in XML_ARTIFACTS {
        // Re-scan after every removal: deleting one tag can splice the
        // surrounding text into another occurrence of the same tag.
        while let Some(pos) = find_ascii_ci(&out, tag) {
            // Every tag is pure ASCII and begins with `<` / ends with `>`, so a
            // byte-level match always starts and ends on a char boundary.
            out.replace_range(pos..pos + tag.len(), "");
        }
    }
    out
}
2335
2336/// Extract native Gemma-4 <|tool_call|> tags from text.
2337/// Format: <|tool_call|>call:func_name{key:<|"|>value<|"|>, key2:value2}<tool_call|>
2338pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2339    use regex::Regex;
2340    let mut results = Vec::new();
2341
2342    // -- Format 1: Gemma 4 Native (call:name{args}) --
2343    let re_call = Regex::new(
2344        r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2345    ).unwrap();
2346    let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2347
2348    for cap in re_call.captures_iter(text) {
2349        let name = cap[1].to_string();
2350        let args_str = &cap[2];
2351        let mut arguments = serde_json::Map::new();
2352
2353        for arg_cap in re_arg.captures_iter(args_str) {
2354            let key = arg_cap[1].to_string();
2355            let val_raw = arg_cap
2356                .get(2)
2357                .map(|m| m.as_str())
2358                .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2359                .unwrap_or("")
2360                .trim();
2361            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2362
2363            let val = if normalized_raw == "true" {
2364                Value::Bool(true)
2365            } else if normalized_raw == "false" {
2366                Value::Bool(false)
2367            } else if let Ok(n) = normalized_raw.parse::<i64>() {
2368                Value::Number(n.into())
2369            } else if let Ok(n) = normalized_raw.parse::<u64>() {
2370                Value::Number(n.into())
2371            } else if let Ok(n) = normalized_raw.parse::<f64>() {
2372                serde_json::Number::from_f64(n)
2373                    .map(Value::Number)
2374                    .unwrap_or(Value::String(normalized_raw.clone()))
2375            } else {
2376                Value::String(normalized_raw)
2377            };
2378
2379            arguments.insert(key, val);
2380        }
2381
2382        results.push(ToolCallResponse {
2383            id: format!("call_{}", rand::random::<u32>()),
2384            call_type: "function".to_string(),
2385            function: ToolCallFn {
2386                name,
2387                arguments: Value::Object(arguments).to_string(),
2388            },
2389        });
2390    }
2391
2392    // -- Format 2: XML (Qwen/Claude style) --
2393    let re_xml_call = Regex::new(
2394        r#"(?s)<tool_call>\s*<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>)?\s*</tool_call>"#
2395    ).unwrap();
2396    let re_xml_param =
2397        Regex::new(r#"(?s)<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)</parameter>"#).unwrap();
2398
2399    for cap in re_xml_call.captures_iter(text) {
2400        let name = cap[1].to_string();
2401        let body = &cap[2];
2402        let mut arguments = serde_json::Map::new();
2403
2404        for p_cap in re_xml_param.captures_iter(body) {
2405            let key = p_cap[1].to_string();
2406            let val_raw = p_cap[2].trim();
2407            let val = if val_raw == "true" {
2408                Value::Bool(true)
2409            } else if val_raw == "false" {
2410                Value::Bool(false)
2411            } else if let Ok(n) = val_raw.parse::<i64>() {
2412                Value::Number(n.into())
2413            } else if let Ok(n) = val_raw.parse::<u64>() {
2414                Value::Number(n.into())
2415            } else {
2416                Value::String(val_raw.to_string())
2417            };
2418            arguments.insert(key, val);
2419        }
2420
2421        if !arguments.is_empty() || name == "clear_context" {
2422            results.push(ToolCallResponse {
2423                id: format!("call_{}", rand::random::<u32>()),
2424                call_type: "function".to_string(),
2425                function: ToolCallFn {
2426                    name,
2427                    arguments: Value::Object(arguments).to_string(),
2428                },
2429            });
2430        }
2431    }
2432
2433    results
2434}
2435
2436pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2437    let trimmed = raw.trim();
2438    let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2439
2440    let mut value = match serde_json::from_str::<Value>(&candidate) {
2441        Ok(v) => v,
2442        Err(_) => return candidate,
2443    };
2444    normalize_tool_argument_value(tool_name, &mut value);
2445    value.to_string()
2446}
2447
2448fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2449    match value {
2450        Value::String(s) => *s = normalize_string_arg(s),
2451        Value::Array(items) => {
2452            for item in items {
2453                normalize_tool_argument_value(tool_name, item);
2454            }
2455        }
2456        Value::Object(map) => {
2457            for val in map.values_mut() {
2458                normalize_tool_argument_value(tool_name, val);
2459            }
2460            if tool_name == "grep_files" {
2461                if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2462                    *pattern = normalize_regex_pattern(pattern);
2463                }
2464            }
2465            for key in ["path", "extension", "query", "command", "reason"] {
2466                if let Some(Value::String(s)) = map.get_mut(key) {
2467                    *s = normalize_string_arg(s);
2468                }
2469            }
2470        }
2471        _ => {}
2472    }
2473}
2474
/// Strip exactly one layer of matching outer quotes (`"`, `'` or `` ` ``).
///
/// Returns `None` when the input is not wrapped in a matching pair of the same
/// quote character. Otherwise the inner text is returned with `\"` replaced by
/// `"`, then `\\` replaced by `\`, and finally trimmed.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    let mut rest = input.chars();
    // Pull one char off each end; fewer than two chars bails out via `?`.
    let (opening, closing) = (rest.next()?, rest.next_back()?);

    let is_quote = matches!(opening, '"' | '\'' | '`');
    if !is_quote || opening != closing {
        return None;
    }

    // What is left in the iterator is exactly the text between the quotes.
    let body = rest.as_str();
    Some(
        body.replace("\\\"", "\"")
            .replace("\\\\", "\\")
            .trim()
            .to_string(),
    )
}
2488
/// Trim a string argument and peel off every layer of matching outer quotes.
///
/// After the initial trim, as long as the text both starts and ends with the
/// same quote character (`"`, `'` or `` ` ``) that pair is removed and the
/// remainder is trimmed again. A lone quote character is left as it is.
fn normalize_string_arg(input: &str) -> String {
    /// The text between a matching pair of outer quotes, if there is one.
    fn peel_quotes(s: &str) -> Option<&str> {
        ['"', '\'', '`']
            .iter()
            .find_map(|&quote| s.strip_prefix(quote)?.strip_suffix(quote))
    }

    let mut current = input.trim();
    while let Some(inner) = peel_quotes(current) {
        current = inner.trim();
    }
    current.to_string()
}
2506
2507fn normalize_regex_pattern(input: &str) -> String {
2508    let out = normalize_string_arg(input);
2509    if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2510        out[1..out.len() - 1].to_string()
2511    } else {
2512        out
2513    }
2514}
2515
2516fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2517    let mut system_blocks = Vec::new();
2518    let mut prepared = Vec::new();
2519    let mut seeded = false;
2520
2521    for message in messages {
2522        if message.role == "system" {
2523            let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2524                .trim()
2525                .to_string();
2526            if !cleaned.is_empty() {
2527                system_blocks.push(cleaned);
2528            }
2529            continue;
2530        }
2531
2532        let mut clone = message.clone();
2533        clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2534
2535        if !seeded && message.role == "user" {
2536            let mut merged = String::new();
2537            if !system_blocks.is_empty() {
2538                merged.push_str("System instructions for this turn:\n");
2539                merged.push_str(&system_blocks.join("\n\n"));
2540                merged.push_str("\n\n");
2541            }
2542            merged.push_str(clone.content.as_str());
2543            clone.content = MessageContent::Text(merged);
2544            seeded = true;
2545        }
2546
2547        prepared.push(clone);
2548    }
2549
2550    if !seeded && !system_blocks.is_empty() {
2551        prepared.insert(
2552            0,
2553            ChatMessage::user(&format!(
2554                "System instructions for this turn:\n{}",
2555                system_blocks.join("\n\n")
2556            )),
2557        );
2558    }
2559
2560    prepared
2561}
2562
/// Remove legacy `<|turn>role\n` openers and `<turn|>` closers from `text`,
/// then trim surrounding whitespace.
///
/// An opener is only removed when it is immediately followed by a newline.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    // Applied in this order, one full pass per marker.
    const TURN_MARKERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let mut stripped = text.to_string();
    for marker in TURN_MARKERS.iter() {
        stripped = stripped.replace(*marker, "");
    }
    stripped.trim().to_string()
}
2572
2573pub fn strip_native_tool_call_text(text: &str) -> String {
2574    use regex::Regex;
2575    // Format 1: Gemma 4 Native
2576    let re_call = Regex::new(
2577        r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2578    ).unwrap();
2579    // Format 2: XML (Qwen/Claude style)
2580    let re_xml = Regex::new(r#"(?s)<tool_call>\s*<function=.*?>.*?</tool_call>"#).unwrap();
2581    let re_response =
2582        Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2583            .unwrap();
2584    let without_calls = re_call.replace_all(text, "");
2585    let without_xml = re_xml.replace_all(without_calls.as_ref(), "");
2586    re_response
2587        .replace_all(without_xml.as_ref(), "")
2588        .trim()
2589        .to_string()
2590}
2591
// Unit tests for system-prompt construction and for extracting/stripping
// inline tool-call markup (Gemma native and Qwen-style XML formats).
#[cfg(test)]
mod tests {
    use super::*;

    /// The generated system prompt must contain the crate's version constant.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let system = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);
        assert!(system.contains(crate::HEMATITE_VERSION));
    }

    /// A Gemma-style call whose opening tag is `<|tool_call>` and closing tag is
    /// `<tool_call|>` is extracted, its integer arguments become JSON numbers,
    /// and the backslash-escaped quotes around `path` are removed.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        let text = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "read_file");

        // Arguments are carried as a serialized JSON object.
        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args.get("limit").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(args.get("offset").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(
            args.get("path").and_then(|v| v.as_str()),
            Some("src/ui/tui.rs")
        );

        // Stripping must leave neither the opening nor the closing tag behind.
        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<tool_call|>"));
    }

    /// A transcript in which tool calls are interleaved with `<|tool_response>`
    /// sections: both calls are extracted in order, and stripping removes the
    /// call markup as well as the tool-response tags.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let text = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "list_files");
        assert_eq!(calls[1].function.name, "read_file");

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<|tool_response"));
        assert!(!stripped.contains("<tool_response|>"));
    }

    /// An XML-style `<tool_call><function=…>` block that follows prose is
    /// extracted with its multi-line parameter values trimmed, and stripping
    /// removes the block's tags from the text.
    #[test]
    fn extracts_qwen_xml_tool_calls_from_reasoning() {
        let text = r#"Based on the project structure, I need to check the binary.
<tool_call>
<function=shell>
<parameter=command>
ls -la hematite.exe
</parameter>
<parameter=reason>
Check if the binary exists
</parameter>
</function>
</tool_call>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "shell");

        // Parameter values come back without their surrounding newlines.
        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(
            args.get("command").and_then(|v| v.as_str()),
            Some("ls -la hematite.exe")
        );
        assert_eq!(
            args.get("reason").and_then(|v| v.as_str()),
            Some("Check if the binary exists")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("<function=shell>"));
    }
}
hematite/agent/inference.rs

hematite/agent/
inference.rs