hematite/agent/
inference.rs

1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
7// ── Engine ────────────────────────────────────────────────────────────────────
8
9pub struct InferenceEngine {
10    pub client: reqwest::Client,
11    pub api_url: String,
12    /// Root URL of the LLM provider (e.g. `http://localhost:1234`).
13    /// All non-completions endpoints (models list, health, embeddings) are derived from this.
14    pub base_url: String,
15    pub species: String,
16    pub snark: u8,
17    pub kv_semaphore: Semaphore,
18    /// The model ID currently loaded in LM Studio (auto-detected on boot).
19    pub model: std::sync::RwLock<String>,
20    /// Context window length in tokens (auto-detected from LM Studio, default 32768).
21    pub context_length: std::sync::atomic::AtomicUsize,
22    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
23    /// Optional model ID for worker-level tasks (Swarms / research).
24    pub worker_model: Option<String>,
25    /// Opt-in Gemma-native request shaping. Off by default.
26    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
27    /// Global cancellation token for hard-interrupting the inference stream.
28    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
29}
30
/// Returns `true` when `model` contains `gemma-4` or `gemma4`, compared
/// ASCII-case-insensitively.
pub fn is_gemma4_model_name(model: &str) -> bool {
    const GEMMA4_MARKERS: [&str; 2] = ["gemma-4", "gemma4"];

    let normalized = model.to_ascii_lowercase();
    GEMMA4_MARKERS
        .iter()
        .any(|marker| normalized.contains(*marker))
}
35
36fn should_use_gemma_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37    is_gemma4_model_name(model) && engine.gemma_native_formatting_enabled()
38}
39
40// ── OpenAI Tool Definition ────────────────────────────────────────────────────
41
42#[derive(Serialize, Clone, Debug)]
43pub struct ToolDefinition {
44    #[serde(rename = "type")]
45    pub tool_type: String,
46    pub function: ToolFunction,
47    #[serde(skip_serializing, skip_deserializing)]
48    pub metadata: ToolMetadata,
49}
50
51#[derive(Serialize, Clone, Debug)]
52pub struct ToolFunction {
53    pub name: String,
54    pub description: String,
55    pub parameters: Value,
56}
57
/// Coarse grouping attached to every tool via [`ToolMetadata::category`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ToolCategory {
    /// Tools that read repository files.
    RepoRead,
    /// Tools that write or edit repository files.
    RepoWrite,
    /// Shell and host-inspection tools.
    Runtime,
    /// Project mapping and runtime-flow tracing tools.
    Architecture,
    /// Toolchain description tools.
    Toolchain,
    /// Build verification tools.
    Verification,
    /// Git tools.
    Git,
    /// Web research and documentation fetching tools.
    Research,
    /// Image analysis tools.
    Vision,
    /// Language-server tools.
    Lsp,
    /// Context pinning, clarification and task management tools.
    Workflow,
    /// Tools whose name starts with `mcp__`.
    External,
    /// Any tool name without a dedicated classification.
    Other,
}
74
75#[derive(Clone, Copy, Debug, PartialEq, Eq)]
76pub struct ToolMetadata {
77    pub category: ToolCategory,
78    pub mutates_workspace: bool,
79    pub external_surface: bool,
80    pub trust_sensitive: bool,
81    pub read_only_friendly: bool,
82    pub plan_scope: bool,
83}
84
85pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
86    if name.starts_with("mcp__") {
87        let lower = name.to_ascii_lowercase();
88        let mutates_workspace = [
89            "__edit",
90            "__write",
91            "__create",
92            "__move",
93            "__delete",
94            "__remove",
95            "__rename",
96            "__replace",
97            "__patch",
98        ]
99        .iter()
100        .any(|needle| lower.contains(needle));
101        return ToolMetadata {
102            category: ToolCategory::External,
103            mutates_workspace,
104            external_surface: true,
105            trust_sensitive: true,
106            read_only_friendly: !mutates_workspace,
107            plan_scope: false,
108        };
109    }
110
111    match name {
112        "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
113            category: ToolCategory::RepoRead,
114            mutates_workspace: false,
115            external_surface: false,
116            trust_sensitive: false,
117            read_only_friendly: true,
118            plan_scope: true,
119        },
120        "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => ToolMetadata {
121            category: ToolCategory::RepoWrite,
122            mutates_workspace: true,
123            external_surface: false,
124            trust_sensitive: true,
125            read_only_friendly: false,
126            plan_scope: true,
127        },
128        "map_project" | "trace_runtime_flow" => ToolMetadata {
129            category: ToolCategory::Architecture,
130            mutates_workspace: false,
131            external_surface: false,
132            trust_sensitive: false,
133            read_only_friendly: true,
134            plan_scope: false,
135        },
136        "describe_toolchain" => ToolMetadata {
137            category: ToolCategory::Toolchain,
138            mutates_workspace: false,
139            external_surface: false,
140            trust_sensitive: false,
141            read_only_friendly: true,
142            plan_scope: false,
143        },
144        "shell" => ToolMetadata {
145            category: ToolCategory::Runtime,
146            mutates_workspace: true,
147            external_surface: false,
148            trust_sensitive: true,
149            read_only_friendly: false,
150            plan_scope: false,
151        },
152        "inspect_host" => ToolMetadata {
153            category: ToolCategory::Runtime,
154            mutates_workspace: false,
155            external_surface: false,
156            trust_sensitive: false,
157            read_only_friendly: true,
158            plan_scope: false,
159        },
160        "verify_build" => ToolMetadata {
161            category: ToolCategory::Verification,
162            mutates_workspace: false,
163            external_surface: false,
164            trust_sensitive: false,
165            read_only_friendly: true,
166            plan_scope: false,
167        },
168        "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
169            ToolMetadata {
170                category: ToolCategory::Git,
171                mutates_workspace: true,
172                external_surface: false,
173                trust_sensitive: true,
174                read_only_friendly: false,
175                plan_scope: false,
176            }
177        }
178        "research_web" | "fetch_docs" => ToolMetadata {
179            category: ToolCategory::Research,
180            mutates_workspace: false,
181            external_surface: false,
182            trust_sensitive: false,
183            read_only_friendly: true,
184            plan_scope: false,
185        },
186        "vision_analyze" => ToolMetadata {
187            category: ToolCategory::Vision,
188            mutates_workspace: false,
189            external_surface: false,
190            trust_sensitive: false,
191            read_only_friendly: true,
192            plan_scope: false,
193        },
194        "lsp_definitions"
195        | "lsp_references"
196        | "lsp_hover"
197        | "lsp_rename_symbol"
198        | "lsp_get_diagnostics"
199        | "lsp_search_symbol" => ToolMetadata {
200            category: ToolCategory::Lsp,
201            mutates_workspace: false,
202            external_surface: false,
203            trust_sensitive: false,
204            read_only_friendly: true,
205            plan_scope: false,
206        },
207        "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
208            category: ToolCategory::Workflow,
209            mutates_workspace: false,
210            external_surface: false,
211            trust_sensitive: false,
212            read_only_friendly: true,
213            plan_scope: true,
214        },
215        "manage_tasks" => ToolMetadata {
216            category: ToolCategory::Workflow,
217            mutates_workspace: false,
218            external_surface: false,
219            trust_sensitive: false,
220            read_only_friendly: true,
221            plan_scope: false,
222        },
223        _ => ToolMetadata {
224            category: ToolCategory::Other,
225            mutates_workspace: false,
226            external_surface: false,
227            trust_sensitive: false,
228            read_only_friendly: true,
229            plan_scope: false,
230        },
231    }
232}
233
234// ── Message types ─────────────────────────────────────────────────────────────
235
236/// OpenAI-compatible chat message. Content can be a string (legacy) or a
237/// Vec of ContentPart (multimodal).
238#[derive(Serialize, Deserialize, Clone, Debug)]
239pub struct ChatMessage {
240    pub role: String,
241    /// Support both simple string content and complex multi-part content (Vision).
242    pub content: MessageContent,
243    /// Assistant messages may have tool calls. Default to empty vec, not null.
244    #[serde(default, skip_serializing_if = "Vec::is_empty")]
245    pub tool_calls: Vec<ToolCallResponse>,
246    /// Tool message references the original call.
247    #[serde(skip_serializing_if = "Option::is_none")]
248    pub tool_call_id: Option<String>,
249    /// Tool message name.
250    #[serde(skip_serializing_if = "Option::is_none")]
251    pub name: Option<String>,
252}
253
254#[derive(Serialize, Deserialize, Clone, Debug)]
255#[serde(untagged)]
256pub enum MessageContent {
257    Text(String),
258    Parts(Vec<ContentPart>),
259}
260
261#[derive(Serialize, Deserialize, Clone, Debug)]
262#[serde(tag = "type")]
263pub enum ContentPart {
264    #[serde(rename = "text")]
265    Text { text: String },
266    #[serde(rename = "image_url")]
267    ImageUrl { image_url: ImageUrlSource },
268}
269
270#[derive(Serialize, Deserialize, Clone, Debug)]
271pub struct ImageUrlSource {
272    pub url: String,
273}
274
275impl Default for MessageContent {
276    fn default() -> Self {
277        MessageContent::Text(String::new())
278    }
279}
280
281impl MessageContent {
282    pub fn as_str(&self) -> &str {
283        match self {
284            MessageContent::Text(s) => s,
285            MessageContent::Parts(parts) => {
286                for part in parts {
287                    if let ContentPart::Text { text } = part {
288                        return text;
289                    }
290                }
291                ""
292            }
293        }
294    }
295}
296
297impl ChatMessage {
298    pub fn system(content: &str) -> Self {
299        Self {
300            role: "system".into(),
301            content: MessageContent::Text(content.into()),
302            tool_calls: Vec::new(),
303            tool_call_id: None,
304            name: None,
305        }
306    }
307    pub fn user(content: &str) -> Self {
308        Self {
309            role: "user".into(),
310            content: MessageContent::Text(content.into()),
311            tool_calls: Vec::new(),
312            tool_call_id: None,
313            name: None,
314        }
315    }
316    pub fn user_with_image(text: &str, image_url: &str) -> Self {
317        let mut text_parts = text.to_string();
318        if !text_parts.contains("<|image|>") {
319            text_parts.push_str(" <|image|>");
320        }
321        Self {
322            role: "user".into(),
323            content: MessageContent::Parts(vec![
324                ContentPart::Text { text: text_parts },
325                ContentPart::ImageUrl {
326                    image_url: ImageUrlSource {
327                        url: image_url.into(),
328                    },
329                },
330            ]),
331            tool_calls: Vec::new(),
332            tool_call_id: None,
333            name: None,
334        }
335    }
336    pub fn assistant_text(content: &str) -> Self {
337        Self {
338            role: "assistant".into(),
339            content: MessageContent::Text(content.into()),
340            tool_calls: Vec::new(),
341            tool_call_id: None,
342            name: None,
343        }
344    }
345    pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
346        Self {
347            role: "assistant".into(),
348            content: MessageContent::Text(content.into()),
349            tool_calls: calls,
350            tool_call_id: None,
351            name: None,
352        }
353    }
354    pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
355        Self::tool_result_for_model(tool_call_id, fn_name, content, "")
356    }
357
358    /// Build a tool result message, applying Gemma 4 native markup only when the
359    /// loaded model is actually a Gemma 4 model.
360    pub fn tool_result_for_model(
361        tool_call_id: &str,
362        fn_name: &str,
363        content: &str,
364        model: &str,
365    ) -> Self {
366        let body = if is_gemma4_model_name(model) {
367            format!(
368                "<|tool_response>response:{}{}{}<tool_response|>",
369                fn_name, "{", content
370            )
371        } else {
372            content.to_string()
373        };
374        Self {
375            role: "tool".into(),
376            content: MessageContent::Text(body),
377            tool_calls: Vec::new(),
378            tool_call_id: Some(tool_call_id.into()),
379            name: Some(fn_name.into()),
380        }
381    }
382}
383
384// ── Tool call as returned by the model ───────────────────────────────────────
385
386#[derive(Serialize, Deserialize, Clone, Debug)]
387pub struct ToolCallResponse {
388    pub id: String,
389    #[serde(rename = "type")]
390    pub call_type: String,
391    pub function: ToolCallFn,
392}
393
394#[derive(Serialize, Deserialize, Clone, Debug)]
395pub struct ToolCallFn {
396    pub name: String,
397    /// JSON-encoded arguments string (as returned by the API).
398    pub arguments: String,
399}
400
401// ── HTTP request / response shapes ───────────────────────────────────────────
402
403#[derive(Serialize)]
404struct ChatRequest {
405    model: String,
406    messages: Vec<ChatMessage>,
407    temperature: f32,
408    stream: bool,
409    #[serde(skip_serializing_if = "Option::is_none")]
410    tools: Option<Vec<ToolDefinition>>,
411}
412
413#[derive(Deserialize, Debug)]
414struct ChatResponse {
415    choices: Vec<ResponseChoice>,
416    usage: Option<TokenUsage>,
417}
418
419#[derive(Deserialize, Debug, Clone)]
420pub struct TokenUsage {
421    pub prompt_tokens: usize,
422    pub completion_tokens: usize,
423    pub total_tokens: usize,
424    #[serde(default)]
425    pub prompt_cache_hit_tokens: usize,
426    #[serde(default)]
427    pub cache_read_input_tokens: usize,
428}
429
430#[derive(Deserialize, Debug)]
431struct ResponseChoice {
432    message: ResponseMessage,
433    #[serde(default)]
434    finish_reason: Option<String>,
435}
436
437#[derive(Deserialize, Debug)]
438struct ResponseMessage {
439    content: Option<String>,
440    tool_calls: Option<Vec<ToolCallResponse>>,
441}
442
/// Lower bound, in tokens, for the reserved output budget.
/// NOTE(review): the code that applies this bound is outside this part of the file.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1_024;
/// Upper bound, in tokens, for the reserved output budget.
/// NOTE(review): the code that applies this bound is outside this part of the file.
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4_096;
445
/// A context window of at most 8 192 tokens counts as "tiny".
fn is_tiny_context_window(context_length: usize) -> bool {
    (0..=8_192).contains(&context_length)
}
449
/// A context window strictly above 8 192 and at most 49 152 tokens counts as
/// "compact".
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
453
454pub fn is_compact_context_window_pub(context_length: usize) -> bool {
455    is_compact_context_window(context_length)
456}
457
/// Recognise provider error text that indicates a context-window overflow.
///
/// `lower` is matched as given; the caller in this file passes an
/// already-lowercased string. Matches either the `n_keep` + `n_ctx` pair or any
/// one of the fixed phrases below.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const LIMIT_PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    if lower.contains("n_keep") && lower.contains("n_ctx") {
        return true;
    }
    LIMIT_PHRASES.iter().any(|phrase| lower.contains(*phrase))
}
465
466fn classify_runtime_failure_tag(detail: &str) -> &'static str {
467    let lower = detail.to_ascii_lowercase();
468    if lower.contains("context_window_blocked")
469        || lower.contains("context ceiling reached")
470        || lower.contains("exceeds the")
471        || is_provider_context_limit_detail(&lower)
472    {
473        "context_window"
474    } else if lower.contains("empty response from model")
475        || lower.contains("model returned an empty response")
476    {
477        "empty_model_response"
478    } else if lower.contains("action blocked:")
479        || lower.contains("access denied")
480        || lower.contains("declined by user")
481    {
482        "tool_policy_blocked"
483    } else {
484        "provider_degraded"
485    }
486}
487
/// Operator-facing advice for a failure tag; unknown tags get the generic
/// "retry, then narrow or restart" advice.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const CONTEXT_WINDOW: &str = "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    const EMPTY_RESPONSE: &str = "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    const POLICY_BLOCKED: &str =
        "Stay inside the allowed workflow or switch modes before retrying.";
    const GENERIC: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";

    if tag == "context_window" {
        CONTEXT_WINDOW
    } else if tag == "empty_model_response" {
        EMPTY_RESPONSE
    } else if tag == "tool_policy_blocked" {
        POLICY_BLOCKED
    } else {
        GENERIC
    }
}
502
503fn format_runtime_failure_message(detail: &str) -> String {
504    let tag = classify_runtime_failure_tag(detail);
505    format!(
506        "[failure:{}] {} Detail: {}",
507        tag,
508        runtime_failure_guidance(tag),
509        detail.trim()
510    )
511}
512
/// Compact provider health state reported to the operator surface.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ProviderRuntimeState {
    /// Provider is starting up.
    Booting,
    /// Provider is up and serving.
    Live,
    /// A recovery attempt is in progress.
    Recovering,
    /// Generic failure state; also the fallback for unrecognised failure tags.
    Degraded,
    /// The last failure was a context-window overflow.
    ContextWindow,
    /// The last failure was an empty model response.
    EmptyResponse,
}
522
/// Compact MCP server health state reported to the operator surface.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum McpRuntimeState {
    /// No MCP servers are configured.
    Unconfigured,
    /// MCP servers are working.
    Healthy,
    /// MCP is partially working.
    Degraded,
    /// MCP is unavailable.
    Failed,
}
530
/// Typed checkpoint / blocker state surfaced to the operator.
///
/// NOTE(review): the conditions that raise each state live outside this part
/// of the file; the variant notes only restate the names.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum OperatorCheckpointState {
    /// Nothing to report.
    Idle,
    /// Provider recovery is in progress.
    RecoveringProvider,
    /// The budget was reduced.
    BudgetReduced,
    /// Conversation history was compacted.
    HistoryCompacted,
    /// Blocked by the context window.
    BlockedContextWindow,
    /// Blocked by policy.
    BlockedPolicy,
    /// Blocked pending recent file evidence.
    BlockedRecentFileEvidence,
    /// Blocked pending an exact line window.
    BlockedExactLineWindow,
    /// Blocked because of a tool loop.
    BlockedToolLoop,
    /// Blocked pending verification.
    BlockedVerification,
}
544
545impl OperatorCheckpointState {
546    pub fn label(self) -> &'static str {
547        match self {
548            OperatorCheckpointState::Idle => "idle",
549            OperatorCheckpointState::RecoveringProvider => "recovering_provider",
550            OperatorCheckpointState::BudgetReduced => "budget_reduced",
551            OperatorCheckpointState::HistoryCompacted => "history_compacted",
552            OperatorCheckpointState::BlockedContextWindow => "blocked_context_window",
553            OperatorCheckpointState::BlockedPolicy => "blocked_policy",
554            OperatorCheckpointState::BlockedRecentFileEvidence => "blocked_recent_file_evidence",
555            OperatorCheckpointState::BlockedExactLineWindow => "blocked_exact_line_window",
556            OperatorCheckpointState::BlockedToolLoop => "blocked_tool_loop",
557            OperatorCheckpointState::BlockedVerification => "blocked_verification",
558        }
559    }
560}
561
562fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
563    match tag {
564        "context_window" => ProviderRuntimeState::ContextWindow,
565        "empty_model_response" => ProviderRuntimeState::EmptyResponse,
566        _ => ProviderRuntimeState::Degraded,
567    }
568}
569
/// One-line failure summary for the operator surface.
///
/// The three known tags map to fixed sentences. Any other tag produces
/// `LM Studio degraded: <excerpt>`, where the excerpt is the first 12
/// whitespace-separated words of `detail`, capped at 110 bytes (plus `...`
/// when cut). A blank `detail` yields a fixed fallback sentence.
///
/// The cap is applied on a UTF-8 character boundary, so non-ASCII provider
/// messages are shortened instead of causing a panic.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    const MAX_EXCERPT_BYTES: usize = 110;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(12)
                .collect::<Vec<_>>()
                .join(" ");
            if excerpt.len() > MAX_EXCERPT_BYTES {
                // `String::truncate` panics when the cut lands inside a
                // multi-byte character, so back up to the nearest boundary.
                // Index 0 is always a boundary, which bounds the loop.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }
            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
603
604// ── Events pushed to the TUI ──────────────────────────────────────────────────
605
606#[derive(Debug)]
607pub enum InferenceEvent {
608    /// A text token to append to the current assistant message.
609    Token(String),
610    /// A text token to be displayed on screen but NOT spoken (e.g. startup greeting).
611    MutedToken(String),
612    /// Internal model reasoning (shown in side panel, not dialogue).
613    Thought(String),
614    /// Critical diagnostic feedback from the voice synthesis engine.
615    VoiceStatus(String),
616    /// A tool call is starting – show a status line in the TUI.
617    ToolCallStart {
618        id: String,
619        name: String,
620        args: String,
621    },
622    /// A tool call completed – show result in the TUI.
623    ToolCallResult {
624        id: String,
625        name: String,
626        output: String,
627        is_error: bool,
628    },
629    /// A risky tool requires explicit user approval.
630    /// The TUI must send `true` (approved) or `false` (rejected) via `responder`.
631    /// When `diff` is Some, the modal renders a coloured before/after diff preview.
632    ApprovalRequired {
633        id: String,
634        name: String,
635        display: String,
636        /// Pre-formatted diff: lines starting with "- " are removals, "+ " are additions,
637        /// "---" is a file header.  None means a plain high-risk approval (no diff).
638        diff: Option<String>,
639        responder: tokio::sync::oneshot::Sender<bool>,
640    },
641    /// The current agent turn is complete.
642    Done,
643    /// An error occurred during inference.
644    Error(String),
645    /// Compact provider/runtime state for the operator surface.
646    ProviderStatus {
647        state: ProviderRuntimeState,
648        summary: String,
649    },
650    /// Typed operator checkpoint/blocker state for SPECULAR and recovery UIs.
651    OperatorCheckpoint {
652        state: OperatorCheckpointState,
653        summary: String,
654    },
655    /// Typed recovery recipe summary for operator/debug surfaces.
656    RecoveryRecipe { summary: String },
657    /// Compact MCP/runtime server health for the operator surface.
658    McpStatus {
659        state: McpRuntimeState,
660        summary: String,
661    },
662    /// Current compaction pressure against the adaptive threshold.
663    CompactionPressure {
664        estimated_tokens: usize,
665        threshold_tokens: usize,
666        percent: u8,
667    },
668    /// Current total prompt-budget pressure against the live context window.
669    PromptPressure {
670        estimated_input_tokens: usize,
671        reserved_output_tokens: usize,
672        estimated_total_tokens: usize,
673        context_length: usize,
674        percent: u8,
675    },
676    /// A generic task progress update (e.g. for single-agent tool execution).
677    TaskProgress {
678        id: String,
679        label: String,
680        progress: u8,
681    },
682    /// Real-time token usage update from the API.
683    UsageUpdate(TokenUsage),
684    /// The current runtime profile detected from LM Studio.
685    RuntimeProfile {
686        model_id: String,
687        context_length: usize,
688    },
689    /// Vein index status after each incremental re-index.
690    VeinStatus {
691        file_count: usize,
692        embedded_count: usize,
693        docs_only: bool,
694    },
695    /// File paths the Vein surfaced as relevant to the current turn.
696    /// Used to populate ACTIVE CONTEXT with retrieval results.
697    VeinContext { paths: Vec<String> },
698    /// A new companion was hatched mid-session via /reroll.
699    SoulReroll {
700        species: String,
701        rarity: String,
702        shiny: bool,
703        personality: String,
704    },
705    /// Embed model loaded/unloaded mid-session.
706    EmbedProfile { model_id: Option<String> },
707}
708
709// ── Engine implementation ─────────────────────────────────────────────────────
710
711impl InferenceEngine {
712    pub fn new(
713        api_url: String,
714        species: String,
715        snark: u8,
716    ) -> Result<Self, Box<dyn std::error::Error>> {
717        let client = reqwest::Client::builder()
718            .timeout(std::time::Duration::from_secs(180))
719            .build()?;
720
721        // Extract http://host:port as the base for all non-completions endpoints.
722        let base_url = {
723            let trimmed = api_url.trim_end_matches('/');
724            if let Some(scheme_end) = trimmed.find("://") {
725                let after_scheme = &trimmed[scheme_end + 3..];
726                if let Some(path_start) = after_scheme.find('/') {
727                    format!(
728                        "{}://{}",
729                        &trimmed[..scheme_end],
730                        &after_scheme[..path_start]
731                    )
732                } else {
733                    trimmed.to_string()
734                }
735            } else {
736                trimmed.to_string()
737            }
738        };
739
740        let api_url = if api_url.ends_with("/chat/completions") {
741            api_url
742        } else if api_url.ends_with("/") {
743            format!("{}chat/completions", api_url)
744        } else {
745            format!("{}/chat/completions", api_url)
746        };
747
748        Ok(Self {
749            client,
750            api_url,
751            base_url,
752            species,
753            snark,
754            kv_semaphore: Semaphore::new(3),
755            model: std::sync::RwLock::new(String::new()),
756            context_length: std::sync::atomic::AtomicUsize::new(32_768), // Gemma-4 Sweet Spot (32K)
757            economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
758            worker_model: None,
759            gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
760            cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
761        })
762    }
763
764    pub fn set_gemma_native_formatting(&self, enabled: bool) {
765        self.gemma_native_formatting
766            .store(enabled, std::sync::atomic::Ordering::SeqCst);
767    }
768
769    pub fn gemma_native_formatting_enabled(&self) -> bool {
770        self.gemma_native_formatting
771            .load(std::sync::atomic::Ordering::SeqCst)
772    }
773
774    pub fn current_model(&self) -> String {
775        self.model.read().map(|g| g.clone()).unwrap_or_default()
776    }
777
778    pub fn current_context_length(&self) -> usize {
779        self.context_length
780            .load(std::sync::atomic::Ordering::SeqCst)
781    }
782
783    pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
784        if let Ok(mut guard) = self.model.write() {
785            *guard = model.to_string();
786        }
787        self.context_length
788            .store(context_length, std::sync::atomic::Ordering::SeqCst);
789    }
790
791    /// Returns true if LM Studio is reachable.
792    pub async fn health_check(&self) -> bool {
793        let url = format!("{}/v1/models", self.base_url);
794        match self.client.get(&url).send().await {
795            Ok(resp) => resp.status().is_success(),
796            Err(_) => false,
797        }
798    }
799
800    /// Query /api/v0/models and return the first loaded chat model id.
801    /// Uses /api/v0/models (not /v1/models) because the OpenAI-compat endpoint
802    /// omits the `type` field, making it impossible to distinguish embedding
803    /// models from chat models. Falls back to /v1/models with a name heuristic
804    /// if /api/v0/models is unavailable.
805    /// Returns Some("") when LM Studio is reachable but no chat model is loaded
806    /// so callers can distinguish "offline" (None) from "no chat model" (Some("")).
807    pub async fn get_loaded_model(&self) -> Option<String> {
808        #[derive(Deserialize)]
809        struct ModelList {
810            data: Vec<ModelEntry>,
811        }
812        #[derive(Deserialize)]
813        struct ModelEntry {
814            id: String,
815            #[serde(rename = "type", default)]
816            model_type: String,
817            #[serde(default)]
818            state: String,
819        }
820
821        // Try /api/v0/models first — it has type and state fields.
822        if let Ok(resp) = self
823            .client
824            .get(format!("{}/api/v0/models", self.base_url))
825            .send()
826            .await
827        {
828            if let Ok(list) = resp.json::<ModelList>().await {
829                let chat_model = list
830                    .data
831                    .into_iter()
832                    .find(|m| m.model_type != "embeddings" && m.state == "loaded")
833                    .map(|m| m.id)
834                    .unwrap_or_default();
835                return Some(chat_model);
836            }
837        }
838
839        // Fallback: /v1/models lacks type info — use name heuristic to skip embed models.
840        let resp = self
841            .client
842            .get(format!("{}/v1/models", self.base_url))
843            .send()
844            .await
845            .ok()?;
846        let list: ModelList = resp.json().await.ok()?;
847        Some(
848            list.data
849                .into_iter()
850                .find(|m| !m.id.to_lowercase().contains("embed"))
851                .map(|m| m.id)
852                .unwrap_or_default(),
853        )
854    }
855
856    /// Returns the ID of the first loaded embedding model, if any.
857    /// Uses /api/v0/models which includes `type` and `state` fields.
858    /// The OpenAI-compat /v1/models endpoint omits `type` so cannot be used here.
859    /// Accepts any non-empty state (not just "loaded") to handle LM Studio variants
860    /// where the embed model may report a different state string at startup.
861    pub async fn get_embedding_model(&self) -> Option<String> {
862        #[derive(Deserialize)]
863        struct ModelList {
864            data: Vec<ModelEntry>,
865        }
866        #[derive(Deserialize)]
867        struct ModelEntry {
868            id: String,
869            #[serde(rename = "type", default)]
870            model_type: String,
871            #[serde(default)]
872            state: String,
873        }
874        let resp = self
875            .client
876            .get(format!("{}/api/v0/models", self.base_url))
877            .send()
878            .await
879            .ok()?;
880        let list: ModelList = resp.json().await.ok()?;
881        list.data
882            .into_iter()
883            .find(|m| m.model_type == "embeddings" && m.state == "loaded")
884            .map(|m| m.id)
885    }
886
887    /// Detect the loaded model's context window size.
888    /// Tries LM Studio's `/api/v0/models` endpoint first and prefers the loaded
889    /// model's live `loaded_context_length`, then falls back to older
890    /// `context_length` / `max_context_length` style fields.
891    /// Falls back to a heuristic from the model name, then 32K.
892    pub async fn detect_context_length(&self) -> usize {
893        #[derive(Deserialize)]
894        struct LmStudioModel {
895            id: Option<String>,
896            #[serde(rename = "type", default)]
897            model_type: String,
898            state: Option<String>,
899            loaded_context_length: Option<u64>,
900            context_length: Option<u64>,
901            max_context_length: Option<u64>,
902        }
903        #[derive(Deserialize)]
904        struct LmStudioList {
905            data: Vec<LmStudioModel>,
906        }
907
908        // Check api/v0/models (LM Studio specific)
909        if let Ok(resp) = self
910            .client
911            .get(format!("{}/api/v0/models", self.base_url))
912            .send()
913            .await
914        {
915            if let Ok(list) = resp.json::<LmStudioList>().await {
916                let target_model = self.current_model().to_ascii_lowercase();
917                // Never select embedding models for context-length detection.
918                let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
919                let loaded = list
920                    .data
921                    .iter()
922                    .find(|m| {
923                        non_embed(m)
924                            && m.state.as_deref() == Some("loaded")
925                            && m.id
926                                .as_deref()
927                                .map(|id| id.eq_ignore_ascii_case(&target_model))
928                                .unwrap_or(false)
929                    })
930                    .or_else(|| {
931                        list.data
932                            .iter()
933                            .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
934                    })
935                    .or_else(|| {
936                        list.data.iter().find(|m| {
937                            non_embed(m)
938                                && m.id
939                                    .as_deref()
940                                    .map(|id| id.eq_ignore_ascii_case(&target_model))
941                                    .unwrap_or(false)
942                        })
943                    })
944                    .or_else(|| list.data.iter().find(|m| non_embed(m)));
945
946                if let Some(model) = loaded {
947                    if let Some(ctx) = model.loaded_context_length {
948                        if ctx > 0 {
949                            return ctx as usize;
950                        }
951                    }
952                    if let Some(ctx) = model.context_length {
953                        if ctx > 0 {
954                            return ctx as usize;
955                        }
956                    }
957                    if let Some(ctx) = model.max_context_length {
958                        if ctx > 0 && ctx <= 32_768 {
959                            return ctx as usize;
960                        }
961                    }
962                }
963            }
964        }
965
966        // Heuristic fallback:
967        // If "gemma-4" is detected, we target 32,768 as the baseline standard,
968        // acknowledging that 131,072 is available for High-Capacity tasks.
969        if self.current_model().to_lowercase().contains("gemma-4") {
970            return 32_768;
971        }
972
973        32_768
974    }
975
976    pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
977        let previous_model = self.current_model();
978        let previous_context = self.current_context_length();
979
980        let detected_model = match self.get_loaded_model().await {
981            Some(m) if !m.is_empty() => m,            // coding model found
982            Some(_) => "no model loaded".to_string(), // reachable but no coding model
983            None => previous_model.clone(),           // LM Studio offline
984        };
985
986        if !detected_model.is_empty() && detected_model != previous_model {
987            if let Ok(mut guard) = self.model.write() {
988                *guard = detected_model.clone();
989            }
990        }
991
992        let detected_context = self.detect_context_length().await;
993        let effective_model = if detected_model.is_empty() {
994            previous_model.clone()
995        } else {
996            detected_model
997        };
998
999        let changed = effective_model != previous_model || detected_context != previous_context;
1000        self.set_runtime_profile(&effective_model, detected_context);
1001
1002        Some((effective_model, detected_context, changed))
1003    }
1004
1005    pub fn build_system_prompt(
1006        &self,
1007        snark: u8,
1008        chaos: u8,
1009        brief: bool,
1010        professional: bool,
1011        tools: &[ToolDefinition],
1012        reasoning_history: Option<&str>,
1013        mcp_tools: &[crate::agent::mcp::McpTool],
1014    ) -> String {
1015        let mut sys = self.build_system_prompt_legacy(
1016            snark,
1017            chaos,
1018            brief,
1019            professional,
1020            tools,
1021            reasoning_history,
1022        );
1023
1024        if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1025            sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1026            sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1027            for tool in mcp_tools {
1028                let description = tool
1029                    .description
1030                    .as_deref()
1031                    .unwrap_or("No description provided.");
1032                sys.push_str(&format!("- {}: {}\n", tool.name, description));
1033            }
1034        }
1035
1036        sys
1037    }
1038
1039    pub fn build_system_prompt_legacy(
1040        &self,
1041        snark: u8,
1042        _chaos: u8,
1043        brief: bool,
1044        professional: bool,
1045        tools: &[ToolDefinition],
1046        reasoning_history: Option<&str>,
1047    ) -> String {
1048        let current_context_length = self.current_context_length();
1049        if is_tiny_context_window(current_context_length) {
1050            return self.build_system_prompt_tiny(brief, professional);
1051        }
1052        if is_compact_context_window(current_context_length) {
1053            return self.build_system_prompt_compact(brief, professional, tools);
1054        }
1055
1056        // Hematite bootstrap: keep reasoning disciplined without leaking scaffolding into user-facing replies.
1057        let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
1058                                     - You are Hematite, a local coding system working on the user's machine.\n\
1059                                     - The running Hematite build is ");
1060        sys.push_str(&crate::hematite_version_display());
1061        sys.push_str(".\n\
1062                                     - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
1063                                     - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
1064                                     - For simple questions, answer briefly in plain language.\n\
1065                                     - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
1066                                     - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
1067                                     - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
1068                                     - Keep internal reasoning inside channel delimiters.\n\
1069                                     - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
1070                                     <turn|>\n\n");
1071
1072        if let Some(history) = reasoning_history {
1073            if !history.is_empty() {
1074                sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
1075                sys.push_str(history);
1076                sys.push_str("\n\n");
1077            }
1078        }
1079
1080        // ADAPTIVE THOUGHT EFFICIENCY (Gemma-4 Native)
1081        if brief {
1082            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
1083                          - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
1084                          - Depth: Surface-level verification only.\n\n");
1085        } else {
1086            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
1087                          - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
1088                          - Depth: Full multi-step derivation required.\n\n");
1089        }
1090
1091        // IDENTITY & ENVIRONMENT
1092        let os = std::env::consts::OS;
1093        if professional {
1094            sys.push_str(&format!(
1095                "You are Hematite, a local coding system running on {}. \
1096                 The TUI is one interface layer, not your whole identity. \
1097                 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
1098                 Skip filler and keep the focus on the work.\n",
1099                os
1100            ));
1101        } else {
1102            sys.push_str(&format!(
1103                "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
1104                 The terminal UI is only one surface of the system. \
1105                 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
1106                 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
1107                self.species, snark, os
1108            ));
1109        }
1110
1111        // Inject loaded model and context window so the model knows its own budget.
1112        let current_model = self.current_model();
1113        if !current_model.is_empty() {
1114            sys.push_str(&format!(
1115                "Loaded model: {} | Context window: {} tokens. \
1116                 Calibrate response length and tool-call depth to fit within this budget.\n\n",
1117                current_model, current_context_length
1118            ));
1119            if is_gemma4_model_name(&current_model) {
1120                sys.push_str(
1121                    "Gemma 4 native note: prefer exact tool JSON with no extra prose when calling tools. \
1122                     Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
1123                     For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
1124                );
1125            }
1126        } else {
1127            sys.push_str(&format!(
1128                "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
1129                current_context_length
1130            ));
1131        }
1132
1133        // PROTOCOL & TOOLS
1134        let shell_desc = if cfg!(target_os = "windows") {
1135            "[EXTERNAL SHELL]: `powershell` (Windows).\n\
1136             - Use ONLY for builds, tests, or file migrations. \n\
1137             - You MUST use the `powershell` tool directly. \n\
1138             - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
1139        } else {
1140            "[EXTERNAL SHELL]: `bash` (Unix).\n\
1141             - Use ONLY for builds, tests, or file migrations. \n\
1142             - NEVER wrap bash in other shells. \n\n"
1143        };
1144
1145        sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
1146                      [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
1147                      - These are the ONLY way to explore and modify code. \n\
1148                      - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
1149        sys.push_str(shell_desc);
1150
1151        // ANTI-LOOPING & SELF-AUDIT
1152        sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
1153                      SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
1154
1155        if brief {
1156            sys.push_str(
1157                "BRIEF MODE: Respond in exactly ONE concise sentence unless providing code.\n\n",
1158            );
1159        }
1160
1161        if cfg!(target_os = "windows") {
1162            sys.push_str("Shell Protocol: You are running on WINDOWS. You MUST NOT use 'bash' or '/dev/null'. \
1163                          You MUST use 'powershell' (pwsh) for all shell tasks. \
1164                          DO NOT attempt to manipulate Linux-style paths like /dev, /etc, or /sys.\n\n");
1165        } else if cfg!(target_os = "macos") {
1166            sys.push_str(
1167                "Shell Protocol: You are running on macOS. Use 'bash' or 'zsh' for shell tasks. \
1168                          Standard Unix paths apply.\n\n",
1169            );
1170        } else {
1171            sys.push_str(
1172                "Shell Protocol: You are running on Linux. Use 'bash' for shell tasks. \
1173                          Standard Unix paths apply.\n\n",
1174            );
1175        }
1176
1177        sys.push_str("OUTPUT RULES:\n\
1178                      1. Your internal reasoning goes in <think>...</think> blocks. Do NOT output reasoning as plain text.\n\
1179                      2. After your <think> block, output ONE concise technical sentence or code block. Nothing else.\n\
1180                      3. Do NOT call tools named 'thought', 'think', 'reasoning', or any meta-cognitive name. These are not tools.\n\
1181                      4. NEGATIVE CONSTRAINT: Never use a string containing a dot (.), slash (/), or backslash (\\) as a tool name. Paths are NOT tools.\n\
1182                      5. NEGATIVE CONSTRAINT: Never use the name of a class, struct, or module as a tool name unless it is explicitly in the tool list.\n\
1183                      6. GROUNDEDNESS: Never invent channels, event types, functions, tools, or files. If a detail is not verified from the repo or tool output, say `uncertain`.\n\
1184                      7. TRACE QUESTIONS: For architecture or control-flow questions, prefer verified file and function names over high-level summaries.\n\
1185                      8. If `trace_runtime_flow` fully answers the runtime question, preserve its identifiers exactly. Do not restyle or rename symbols from that tool output.\n\
1186                      9. For generic capability questions, answer from stable Hematite capabilities. Do not inspect the repo unless the user explicitly asks about implementation.\n\
1187                      10. Never infer language support, project support, or internet capability from unrelated crates or config files.\n\
1188                      11. It is fine to say Hematite itself is written in Rust when relevant, but do not imply that capability is limited to Rust projects.\n\
1189                      12. For language questions, answer at the harness level: file operations, shell, build verification, language-aware tooling when available, and multi-language project work.\n\
1190                      13. Prefer real programming language examples like Python, JavaScript, TypeScript, Go, and C# over file extensions when answering language questions.\n\
1191                      14. For project-building questions, talk about scaffolding, implementation, builds, tests, and iteration across different stacks instead of defaulting to a Rust-only example like `cargo build`.\n\
1192                      15. Never mention raw `mcp__*` tool names unless those tools are active this turn and directly relevant.\n\
1193                      16. For tooling-discipline or best-tool-selection questions, prefer `describe_toolchain` over improvising the tool surface from memory.\n\
1194                      17. If `describe_toolchain` fully answers the tooling question, preserve its tool names and investigation order exactly.\n\
1195                      18. PROOF BEFORE ACTION: Before editing an existing file, gather recent evidence with `read_file` or `inspect_lines` on that path or keep it pinned in active context.\n\
1196                      18a. GREP BEFORE READ: For files over ~200 lines, always `grep_files` for a specific pattern to find the target line range BEFORE calling `read_file`. Never read a large file top-to-bottom — use offset+limit to read only the relevant window once grep gives you the line number.\n\
1197                      19. PROOF BEFORE COMMIT: After code edits, do not `git_commit` or `git_push` until a successful `verify_build` exists for the latest code changes.\n\
1198                      20. RISKY SHELL DISCIPLINE: Risky `shell` calls must include a concrete `reason` argument explaining what is being verified or changed.\n\
1199                      21. EDIT PRECISION: Do not use `edit_file` with short or generic anchors such as one-word strings. Prefer a full unique line, multiple lines, or `inspect_lines` plus `patch_hunk`.\n\
1200                      22. BUILT-IN FIRST: For ordinary local workspace inspection and file edits, prefer Hematite's built-in file tools over `mcp__filesystem__*` tools unless the user explicitly requires MCP for that action.\n\
1201                      22a. HOST INSPECTION PRIORITY: For read-only questions about installed tools, PATH entries, desktop items, Downloads size, or directory summaries, prefer `inspect_host` over raw `shell` when it can answer directly.");
1202
1203        // Scaffolding protocol — enforces build validation after project creation.
1204        sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
1205            2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
1206            3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
1207            4. Fix all errors before declaring success.\n\n\
1208            ## PRE-FLIGHT SCOPING PROTOCOL\n\
1209            Before attempting any multi-file task or complex refactor:\n\
1210            1. Use `map_project` to understand the project structure.\n\
1211            2. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
1212            3. Use `auto_pin_context` to keep those files in active context.\n\
1213            4. Only then proceed to deeper edits or research.\n\n\
1214            ## REFACTORING PROTOCOL\n\
1215            When modifying existing code or renaming symbols:\n\
1216            1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
1217            2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
1218            3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
1219
1220        // Inject CLAUDE.md / instruction files from the project directory.
1221        sys.push_str(&load_instruction_files());
1222
1223        // Inject cross-session memories synthesized by DeepReflect.
1224        sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
1225
1226        // Native Gemma-4 Tool Declarations
1227        if !tools.is_empty() {
1228            sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
1229            for tool in tools {
1230                let schema = serde_json::to_string(&tool.function.parameters)
1231                    .unwrap_or_else(|_| "{}".to_string());
1232                sys.push_str(&format!(
1233                    "<|tool>declaration:{}{}{}<tool|>\n",
1234                    tool.function.name, "{", schema
1235                ));
1236                sys.push_str(&format!("// {})\n", tool.function.description));
1237            }
1238        }
1239
1240        sys
1241    }
1242
1243    fn build_system_prompt_compact(
1244        &self,
1245        brief: bool,
1246        professional: bool,
1247        tools: &[ToolDefinition],
1248    ) -> String {
1249        // Compact tier: fits in 16k context. Keeps tool names + one-line descriptions
1250        // but skips full JSON schemas, verbose protocol sections, and CLAUDE.md injection.
1251        let current_model = self.current_model();
1252        let current_context_length = self.current_context_length();
1253        let os = std::env::consts::OS;
1254
1255        let mut sys = String::from("<|turn>system\n<|think|>\n");
1256        sys.push_str(&format!(
1257            "You are Hematite {}, a local coding harness working on the user's machine.\n",
1258            crate::hematite_version_display()
1259        ));
1260        if professional {
1261            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1262        } else {
1263            sys.push_str(&format!(
1264                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1265                self.species
1266            ));
1267        }
1268        sys.push_str(&format!(
1269            "Model: {} | Context: {} tokens. Keep turns focused.\n",
1270            current_model, current_context_length
1271        ));
1272        if is_gemma4_model_name(&current_model) {
1273            sys.push_str(
1274                "Gemma 4: use exact tool JSON. No extra prose in tool calls. \
1275                 Raw regex patterns in grep_files, no slash delimiters.\n",
1276            );
1277        }
1278        if cfg!(target_os = "windows") {
1279            sys.push_str(&format!(
1280                "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1281                os
1282            ));
1283        } else {
1284            sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1285        }
1286        if brief {
1287            sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1288        }
1289
1290        sys.push_str(
1291            "\nCORE RULES:\n\
1292             - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1293             - Verify after edits: run `verify_build` after code changes, before committing.\n\
1294             - One tool at a time. Do not batch unrelated tool calls.\n\
1295             - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1296             - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1297             - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1298        );
1299
1300        if !tools.is_empty() {
1301            sys.push_str("\n# AVAILABLE TOOLS\n");
1302            for tool in tools {
1303                let desc: String = tool.function.description.chars().take(120).collect();
1304                sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1305            }
1306        }
1307
1308        sys.push_str("<turn|>\n");
1309        sys
1310    }
1311
1312    fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1313        let current_model = self.current_model();
1314        let current_context_length = self.current_context_length();
1315        let os = std::env::consts::OS;
1316        let mut sys = format!(
1317            "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1318            crate::hematite_version_display()
1319        );
1320        if professional {
1321            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1322        } else {
1323            sys.push_str(&format!(
1324                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1325                self.species
1326            ));
1327        }
1328        if !current_model.is_empty() {
1329            sys.push_str(&format!(
1330                "Loaded model: {} | Context window: {} tokens.\n",
1331                current_model, current_context_length
1332            ));
1333        } else {
1334            sys.push_str(&format!(
1335                "Context window: {} tokens.\n",
1336                current_context_length
1337            ));
1338        }
1339        sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1340        sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1341        sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1342        if cfg!(target_os = "windows") {
1343            sys.push_str(&format!(
1344                "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1345                os
1346            ));
1347        } else {
1348            sys.push_str(&format!(
1349                "You are running on {}. Use the native Unix shell conventions.\n",
1350                os
1351            ));
1352        }
1353        if brief {
1354            sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1355        }
1356        if is_gemma4_model_name(&current_model) {
1357            sys.push_str(
1358                "Gemma 4 note: use exact tool JSON with no extra prose when calling tools.\n",
1359            );
1360        }
1361        sys.push_str("<turn|>\n");
1362        sys
1363    }
1364
1365    // ── Non-streaming call (used for agentic turns with tool support) ─────────
1366
1367    /// Send messages to the model. Returns (text_content, tool_calls).
1368    /// Exactly one of the two will be Some on a successful response.
1369    pub async fn call_with_tools(
1370        &self,
1371        messages: &[ChatMessage],
1372        tools: &[ToolDefinition],
1373        // Override the model ID for this call. None = use the live runtime model.
1374        model_override: Option<&str>,
1375    ) -> Result<
1376        (
1377            Option<String>,
1378            Option<Vec<ToolCallResponse>>,
1379            Option<TokenUsage>,
1380            Option<String>,
1381        ),
1382        String,
1383    > {
1384        let _permit = self
1385            .kv_semaphore
1386            .acquire()
1387            .await
1388            .map_err(|e| e.to_string())?;
1389
1390        let current_model = self.current_model();
1391        let model = model_override.unwrap_or(current_model.as_str()).to_string();
1392        let filtered_tools = if cfg!(target_os = "windows") {
1393            tools
1394                .iter()
1395                .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1396                .cloned()
1397                .collect::<Vec<_>>()
1398        } else {
1399            tools.to_vec()
1400        };
1401
1402        let request_messages = if should_use_gemma_native_formatting(self, &model) {
1403            prepare_gemma_native_messages(messages)
1404        } else {
1405            messages.to_vec()
1406        };
1407
1408        // In compact context windows, restrict tools to the core coding set.
1409        // Full schemas for 36+ tools add 10k+ tokens via the model's chat template (e.g. Gemma 4).
1410        // Sending a small core set keeps schemas available for structured tool-call dispatch
1411        // while staying within the 16k budget.
1412        const COMPACT_CORE_TOOLS: &[&str] = &[
1413            "read_file",
1414            "inspect_lines",
1415            "edit_file",
1416            "write_file",
1417            "grep_files",
1418            "list_files",
1419            "verify_build",
1420            "shell",
1421            "map_project",
1422        ];
1423        let effective_tools = if is_compact_context_window(self.current_context_length()) {
1424            let core: Vec<_> = filtered_tools
1425                .iter()
1426                .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1427                .cloned()
1428                .collect();
1429            if core.is_empty() {
1430                None
1431            } else {
1432                Some(core)
1433            }
1434        } else if filtered_tools.is_empty() {
1435            None
1436        } else {
1437            Some(filtered_tools)
1438        };
1439
1440        let request = ChatRequest {
1441            model: model.clone(),
1442            messages: request_messages,
1443            temperature: 0.2,
1444            stream: false,
1445            tools: effective_tools,
1446        };
1447
1448        // Exponential backoff: retry up to 3× on 5xx / timeout / connect errors.
1449        preflight_chat_request(
1450            &model,
1451            &request.messages,
1452            request.tools.as_deref().unwrap_or(&[]),
1453            self.current_context_length(),
1454        )?;
1455
1456        let mut last_err = String::new();
1457        let mut response_opt: Option<reqwest::Response> = None;
1458        for attempt in 0..3u32 {
1459            match self.client.post(&self.api_url).json(&request).send().await {
1460                Ok(res) if res.status().is_success() => {
1461                    response_opt = Some(res);
1462                    break;
1463                }
1464                Ok(res) if res.status().as_u16() >= 500 => {
1465                    last_err = format!("LM Studio error {}", res.status());
1466                }
1467                Ok(res) => {
1468                    // 4xx — don't retry
1469                    let status = res.status();
1470                    let body = res.text().await.unwrap_or_default();
1471                    let preview = &body[..body.len().min(300)];
1472                    return Err(format!("LM Studio error {}: {}", status, preview));
1473                }
1474                Err(e) if e.is_timeout() || e.is_connect() => {
1475                    last_err = format!("Request failed: {}", e);
1476                }
1477                Err(e) => return Err(format!("Request failed: {}", e)),
1478            }
1479            if attempt < 2 {
1480                let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1481                tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1482            }
1483        }
1484        let res = response_opt
1485            .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1486
1487        let body: ChatResponse = res
1488            .json()
1489            .await
1490            .map_err(|e| format!("Response parse error: {}", e))?;
1491
1492        if let Some(usage) = &body.usage {
1493            let mut econ = self.economics.lock().unwrap();
1494            econ.input_tokens += usage.prompt_tokens;
1495            econ.output_tokens += usage.completion_tokens;
1496        }
1497
1498        let choice = body
1499            .choices
1500            .into_iter()
1501            .next()
1502            .ok_or_else(|| "Empty response from model".to_string())?;
1503
1504        let finish_reason = choice.finish_reason;
1505        let mut tool_calls = choice.message.tool_calls;
1506        let mut content = choice.message.content;
1507
1508        // Gemma-4 Fallback: If the model outputs native <|tool_call|> tags in the text content,
1509        // extract them and treat them as valid tool calls.
1510        if let Some(raw_content) = &content {
1511            let native_calls = extract_native_tool_calls(raw_content);
1512            if !native_calls.is_empty() {
1513                let mut existing = tool_calls.unwrap_or_default();
1514                existing.extend(native_calls);
1515                tool_calls = Some(existing);
1516                let stripped = strip_native_tool_call_text(raw_content);
1517                content = if stripped.trim().is_empty() {
1518                    None
1519                } else {
1520                    Some(stripped)
1521                };
1522            }
1523        }
1524
1525        if is_gemma4_model_name(&model) {
1526            if let Some(calls) = tool_calls.as_mut() {
1527                for call in calls.iter_mut() {
1528                    call.function.arguments = normalize_tool_argument_string(
1529                        &call.function.name,
1530                        &call.function.arguments,
1531                    );
1532                }
1533            }
1534        }
1535
1536        Ok((content, tool_calls, body.usage, finish_reason))
1537    }
1538
1539    // ── Streaming call (used for plain-text responses) ────────────────────────
1540
1541    /// Stream a conversation (no tools). Emits Token/Done/Error events.
1542    pub async fn stream_messages(
1543        &self,
1544        messages: &[ChatMessage],
1545        tx: mpsc::Sender<InferenceEvent>,
1546    ) -> Result<(), Box<dyn std::error::Error>> {
1547        let current_model = self.current_model();
1548        let request_messages = if should_use_gemma_native_formatting(self, &current_model) {
1549            prepare_gemma_native_messages(messages)
1550        } else {
1551            messages
1552                .iter()
1553                .map(|m| {
1554                    let mut clone = m.clone();
1555                    let current_text = m.content.as_str();
1556                    if !current_text.starts_with("<|turn>") {
1557                        clone.content = MessageContent::Text(format!(
1558                            "<|turn>{}\n{}\n<turn|>",
1559                            m.role, current_text
1560                        ));
1561                    }
1562                    clone
1563                })
1564                .collect()
1565        };
1566
1567        let request = ChatRequest {
1568            model: current_model.clone(),
1569            messages: request_messages,
1570            temperature: 0.7,
1571            stream: true,
1572            tools: None,
1573        };
1574
1575        if let Err(e) = preflight_chat_request(
1576            &current_model,
1577            &request.messages,
1578            &[],
1579            self.current_context_length(),
1580        ) {
1581            let tag = classify_runtime_failure_tag(&e);
1582            let _ = tx
1583                .send(InferenceEvent::ProviderStatus {
1584                    state: provider_state_for_failure_tag(tag),
1585                    summary: compact_runtime_failure_summary(tag, &e),
1586                })
1587                .await;
1588            let _ = tx
1589                .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1590                .await;
1591            let _ = tx.send(InferenceEvent::Done).await;
1592            return Ok(());
1593        }
1594
1595        let mut last_err = String::new();
1596        let mut response_opt: Option<reqwest::Response> = None;
1597        for attempt in 0..2u32 {
1598            match self.client.post(&self.api_url).json(&request).send().await {
1599                Ok(res) if res.status().is_success() => {
1600                    response_opt = Some(res);
1601                    break;
1602                }
1603                Ok(res) if res.status().as_u16() >= 500 => {
1604                    last_err = format!("LM Studio error {}", res.status());
1605                }
1606                Ok(res) => {
1607                    let status = res.status();
1608                    let body = res.text().await.unwrap_or_default();
1609                    let preview = &body[..body.len().min(300)];
1610                    let detail = format!("LM Studio error {}: {}", status, preview);
1611                    let tag = classify_runtime_failure_tag(&detail);
1612                    let _ = tx
1613                        .send(InferenceEvent::ProviderStatus {
1614                            state: provider_state_for_failure_tag(tag),
1615                            summary: compact_runtime_failure_summary(tag, &detail),
1616                        })
1617                        .await;
1618                    let _ = tx
1619                        .send(InferenceEvent::Error(format_runtime_failure_message(
1620                            &detail,
1621                        )))
1622                        .await;
1623                    let _ = tx.send(InferenceEvent::Done).await;
1624                    return Ok(());
1625                }
1626                Err(e) if e.is_timeout() || e.is_connect() => {
1627                    last_err = format!("Request failed: {}", e);
1628                }
1629                Err(e) => {
1630                    let detail = format!("Request failed: {}", e);
1631                    let tag = classify_runtime_failure_tag(&detail);
1632                    let _ = tx
1633                        .send(InferenceEvent::ProviderStatus {
1634                            state: provider_state_for_failure_tag(tag),
1635                            summary: compact_runtime_failure_summary(tag, &detail),
1636                        })
1637                        .await;
1638                    let _ = tx
1639                        .send(InferenceEvent::Error(format_runtime_failure_message(
1640                            &detail,
1641                        )))
1642                        .await;
1643                    let _ = tx.send(InferenceEvent::Done).await;
1644                    return Ok(());
1645                }
1646            }
1647            if attempt < 1 {
1648                let _ = tx
1649                    .send(InferenceEvent::ProviderStatus {
1650                        state: ProviderRuntimeState::Recovering,
1651                        summary: "LM Studio degraded during stream startup; retrying once.".into(),
1652                    })
1653                    .await;
1654                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1655            }
1656        }
1657        let Some(res) = response_opt else {
1658            let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1659            let tag = classify_runtime_failure_tag(&detail);
1660            let _ = tx
1661                .send(InferenceEvent::ProviderStatus {
1662                    state: provider_state_for_failure_tag(tag),
1663                    summary: compact_runtime_failure_summary(tag, &detail),
1664                })
1665                .await;
1666            let _ = tx
1667                .send(InferenceEvent::Error(format_runtime_failure_message(
1668                    &detail,
1669                )))
1670                .await;
1671            let _ = tx.send(InferenceEvent::Done).await;
1672            return Ok(());
1673        };
1674
1675        use futures::StreamExt;
1676        let mut byte_stream = res.bytes_stream();
1677
1678        // [Collaborative Strategy] TokenBuffer refactor suggested by Hematite local agent.
1679        // Aggregates tokens to ensure coherent linguistic chunks for UI/Voice.
1680        let mut line_buffer = String::new();
1681        let mut content_buffer = String::new();
1682        let mut past_think = false;
1683        let mut emitted_any_content = false;
1684        let mut emitted_live_status = false;
1685
1686        while let Some(item) = byte_stream.next().await {
1687            // Rapid hardware interrupt check
1688            if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1689                break;
1690            }
1691
1692            let chunk = match item {
1693                Ok(chunk) => chunk,
1694                Err(e) => {
1695                    let detail = format!("Request failed: {}", e);
1696                    let tag = classify_runtime_failure_tag(&detail);
1697                    let _ = tx
1698                        .send(InferenceEvent::ProviderStatus {
1699                            state: provider_state_for_failure_tag(tag),
1700                            summary: compact_runtime_failure_summary(tag, &detail),
1701                        })
1702                        .await;
1703                    let _ = tx
1704                        .send(InferenceEvent::Error(format_runtime_failure_message(
1705                            &detail,
1706                        )))
1707                        .await;
1708                    let _ = tx.send(InferenceEvent::Done).await;
1709                    return Ok(());
1710                }
1711            };
1712            line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1713
1714            while let Some(pos) = line_buffer.find("\n\n") {
1715                let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1716                let data_pos = match event_str.find("data: ") {
1717                    Some(p) => p,
1718                    None => continue,
1719                };
1720
1721                let data = event_str[data_pos + 6..].trim();
1722                if data == "[DONE]" {
1723                    break;
1724                }
1725
1726                if let Ok(json) = serde_json::from_str::<Value>(data) {
1727                    if let Some(content) = json["choices"][0]["delta"]["content"].as_str() {
1728                        if content.is_empty() {
1729                            continue;
1730                        }
1731
1732                        if !past_think {
1733                            let lc = content.to_lowercase();
1734                            let close = lc
1735                                .find("<channel|>")
1736                                .map(|i| (i, "<channel|>".len()))
1737                                .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1738
1739                            if let Some((tag_start, tag_len)) = close {
1740                                // Flush any existing thought buffer
1741                                let before = &content[..tag_start];
1742                                content_buffer.push_str(before);
1743                                if !content_buffer.trim().is_empty() {
1744                                    let _ = tx
1745                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1746                                        .await;
1747                                    emitted_any_content = true;
1748                                }
1749                                content_buffer.clear();
1750
1751                                past_think = true;
1752                                let after = content[tag_start + tag_len..].trim_start_matches('\n');
1753                                content_buffer.push_str(after);
1754                            } else {
1755                                // Still in reasoning block
1756                                content_buffer.push_str(content);
1757                                // Heuristic: Flush thoughts on paragraph/sentence breaks for SPECULAR
1758                                if content_buffer.len() > 30
1759                                    && (content.contains('\n') || content.contains('.'))
1760                                {
1761                                    let _ = tx
1762                                        .send(InferenceEvent::Thought(content_buffer.clone()))
1763                                        .await;
1764                                    emitted_any_content = true;
1765                                    content_buffer.clear();
1766                                }
1767                            }
1768                        } else {
1769                            // PAST THINK: final answer tokens.
1770                            // [Linguistic Buffering] Aggregate into content_buffer until a boundary is hit.
1771                            content_buffer.push_str(content);
1772                            let is_boundary = content.contains(' ')
1773                                || content.contains('.')
1774                                || content.contains('!')
1775                                || content.contains('?');
1776
1777                            if content_buffer.len() > 10 && is_boundary {
1778                                if !emitted_live_status {
1779                                    let _ = tx
1780                                        .send(InferenceEvent::ProviderStatus {
1781                                            state: ProviderRuntimeState::Live,
1782                                            summary: String::new(),
1783                                        })
1784                                        .await;
1785                                    emitted_live_status = true;
1786                                }
1787                                let _ =
1788                                    tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1789                                emitted_any_content = true;
1790                                content_buffer.clear();
1791                            }
1792                        }
1793                    }
1794                }
1795            }
1796        }
1797
1798        // Final Flush
1799        if !content_buffer.is_empty() {
1800            if past_think {
1801                if !emitted_live_status {
1802                    let _ = tx
1803                        .send(InferenceEvent::ProviderStatus {
1804                            state: ProviderRuntimeState::Live,
1805                            summary: String::new(),
1806                        })
1807                        .await;
1808                }
1809                let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1810            } else {
1811                let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1812            }
1813            emitted_any_content = true;
1814        }
1815
1816        if !emitted_any_content {
1817            let _ = tx
1818                .send(InferenceEvent::ProviderStatus {
1819                    state: ProviderRuntimeState::EmptyResponse,
1820                    summary: compact_runtime_failure_summary(
1821                        "empty_model_response",
1822                        "Empty response from model",
1823                    ),
1824                })
1825                .await;
1826            let _ = tx
1827                .send(InferenceEvent::Error(format_runtime_failure_message(
1828                    "Empty response from model",
1829                )))
1830                .await;
1831            let _ = tx.send(InferenceEvent::Done).await;
1832            return Ok(());
1833        }
1834
1835        let _ = tx.send(InferenceEvent::Done).await;
1836        Ok(())
1837    }
1838
1839    /// Single-turn streaming (legacy helper used by startup sequence).
1840    pub async fn stream_generation(
1841        &self,
1842        prompt: &str,
1843        snark: u8,
1844        chaos: u8,
1845        brief: bool,
1846        professional: bool,
1847        tx: mpsc::Sender<InferenceEvent>,
1848    ) -> Result<(), Box<dyn std::error::Error>> {
1849        let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1850        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1851        self.stream_messages(&messages, tx).await
1852    }
1853
1854    // ── Swarm worker helpers (non-streaming) ──────────────────────────────────
1855
1856    /// Runs a task using the `worker_model` if set, otherwise falls back to the main `model`.
1857    pub async fn generate_task_worker(
1858        &self,
1859        prompt: &str,
1860        professional: bool,
1861    ) -> Result<String, String> {
1862        let current_model = self.current_model();
1863        let model = self
1864            .worker_model
1865            .as_deref()
1866            .unwrap_or(current_model.as_str());
1867        self.generate_task_with_model(prompt, 0.1, professional, model)
1868            .await
1869    }
1870
1871    pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1872        self.generate_task_with_temp(prompt, 0.1, professional)
1873            .await
1874    }
1875
1876    pub async fn generate_task_with_temp(
1877        &self,
1878        prompt: &str,
1879        temp: f32,
1880        professional: bool,
1881    ) -> Result<String, String> {
1882        let current_model = self.current_model();
1883        self.generate_task_with_model(prompt, temp, professional, &current_model)
1884            .await
1885    }
1886
1887    pub async fn generate_task_with_model(
1888        &self,
1889        prompt: &str,
1890        temp: f32,
1891        professional: bool,
1892        model: &str,
1893    ) -> Result<String, String> {
1894        let _permit = self
1895            .kv_semaphore
1896            .acquire()
1897            .await
1898            .map_err(|e| e.to_string())?;
1899
1900        let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1901        let request_messages = if should_use_gemma_native_formatting(self, model) {
1902            prepare_gemma_native_messages(&[
1903                ChatMessage::system(&system),
1904                ChatMessage::user(prompt),
1905            ])
1906        } else {
1907            vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1908        };
1909        let request = ChatRequest {
1910            model: model.to_string(),
1911            messages: request_messages,
1912            temperature: temp,
1913            stream: false,
1914            tools: None,
1915        };
1916
1917        preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1918
1919        let res = self
1920            .client
1921            .post(&self.api_url)
1922            .json(&request)
1923            .send()
1924            .await
1925            .map_err(|e| format!("LM Studio request failed: {}", e))?;
1926
1927        let body: ChatResponse = res
1928            .json()
1929            .await
1930            .map_err(|e| format!("Failed to parse response: {}", e))?;
1931
1932        body.choices
1933            .first()
1934            .and_then(|c| c.message.content.clone())
1935            .ok_or_else(|| "Empty response from model".to_string())
1936    }
1937
1938    // ── History management ────────────────────────────────────────────────────
1939
1940    /// Prune middle turns when context grows too large, keeping system + recent N.
1941    #[allow(dead_code)]
1942    pub fn snip_history(
1943        &self,
1944        turns: &[ChatMessage],
1945        max_tokens_estimate: usize,
1946        keep_recent: usize,
1947    ) -> Vec<ChatMessage> {
1948        let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
1949        if total_chars / 4 <= max_tokens_estimate {
1950            return turns.to_vec();
1951        }
1952        let keep = keep_recent.min(turns.len());
1953        let mut snipped = vec![turns[0].clone()];
1954        if turns.len() > keep + 1 {
1955            snipped.push(ChatMessage::system(&format!(
1956                "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
1957                turns.len() - keep - 1
1958            )));
1959            snipped.extend_from_slice(&turns[turns.len() - keep..]);
1960        } else {
1961            snipped = turns.to_vec();
1962        }
1963        snipped
1964    }
1965}
1966
1967fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
1968    serde_json::to_vec(value)
1969        .ok()
1970        .map_or(0, |bytes| bytes.len() / 4 + 1)
1971}
1972
1973const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
1974
1975fn estimate_message_tokens(message: &ChatMessage) -> usize {
1976    let content_tokens = match &message.content {
1977        MessageContent::Text(s) => s.len() / 4 + 1,
1978        MessageContent::Parts(parts) => parts
1979            .iter()
1980            .map(|part| match part {
1981                ContentPart::Text { text } => text.len() / 4 + 1,
1982                // Image payloads are transported as data URLs, but their base64
1983                // length should not be treated like plain text context pressure.
1984                ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
1985            })
1986            .sum(),
1987    };
1988    let tool_tokens: usize = message
1989        .tool_calls
1990        .iter()
1991        .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
1992        .sum();
1993    content_tokens + tool_tokens + 6
1994}
1995
1996pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
1997    messages.iter().map(estimate_message_tokens).sum()
1998}
1999
2000fn reserved_output_tokens(context_length: usize) -> usize {
2001    let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2002    proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2003}
2004
2005pub fn estimate_prompt_pressure(
2006    messages: &[ChatMessage],
2007    tools: &[ToolDefinition],
2008    context_length: usize,
2009) -> (usize, usize, usize, u8) {
2010    let estimated_input_tokens =
2011        estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2012    let reserved_output = reserved_output_tokens(context_length);
2013    let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2014    let percent = if context_length == 0 {
2015        0
2016    } else {
2017        ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2018    };
2019    (
2020        estimated_input_tokens,
2021        reserved_output,
2022        estimated_total,
2023        percent,
2024    )
2025}
2026
2027fn preflight_chat_request(
2028    model: &str,
2029    messages: &[ChatMessage],
2030    tools: &[ToolDefinition],
2031    context_length: usize,
2032) -> Result<(), String> {
2033    let (estimated_input_tokens, reserved_output, estimated_total, _) =
2034        estimate_prompt_pressure(messages, tools, context_length);
2035
2036    if estimated_total > context_length {
2037        return Err(format!(
2038            "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2039            model, estimated_input_tokens, reserved_output, estimated_total, context_length
2040        ));
2041    }
2042
2043    Ok(())
2044}
2045
/// Collect instruction files from the current directory and its ancestors.
///
/// Starting at the current working directory, up to four directory levels
/// (the CWD and three parents) are checked for `CLAUDE.md`, `CLAUDE.local.md`
/// and `.hematite/instructions.md`. Unreadable, empty and duplicate-content
/// files are skipped. Each file is truncated to about 4 KB (always on a
/// character boundary), and a file that would push the running total past
/// 12 KB ends the scan of that directory.
///
/// Returns an empty string when the CWD cannot be determined or nothing was
/// collected; otherwise the files under a `# Project Instructions` heading.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;
    const CANDIDATES: [&str; 3] = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    let Ok(mut dir) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;

    for _ in 0..4 {
        for name in &CANDIDATES {
            let path = dir.join(name);
            // A missing or unreadable file simply fails to read and is skipped.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            // Deduplicate by content hash so identical files are included once.
            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back up to a character boundary: slicing in the middle of a
                // multi-byte character would panic.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            if total_chars + truncated.len() > MAX_TOTAL {
                break;
            }
            total_chars += truncated.len();
            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
        }
        // Move to the parent directory; stop at the filesystem root.
        if !dir.pop() {
            break;
        }
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2109
/// Extract the reasoning text enclosed by Gemma-4 native channel tags.
///
/// Looks for `<|channel>thought` (case-insensitive, ASCII) and returns the
/// trimmed text between it and the following `<channel|>`, or up to the end of
/// the input when the block is never closed. Returns `None` when there is no
/// opening tag or the enclosed text is blank.
pub fn extract_think_block(text: &str) -> Option<String> {
    // The tags are pure ASCII, so ASCII lowercasing is sufficient and — unlike
    // full Unicode lowercasing — keeps byte offsets aligned with `text`.
    let lower = text.to_ascii_lowercase();

    // Official Gemma-4 native tags.
    let open_tag = "<|channel>thought";
    let close_tag = "<channel|>";

    let content_start = lower.find(open_tag)? + open_tag.len();
    let close_pos = lower[content_start..]
        .find(close_tag)
        .map_or(text.len(), |offset| content_start + offset);

    let content = text[content_start..close_pos].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
2132
2133pub fn strip_think_blocks(text: &str) -> String {
2134    // Fast-path: strip a stray </think> the model emits at the start when it skips
2135    // the opening tag (common with Qwen after tool calls). Strip it before the lower
2136    // allocation so it can't slip through any branch below.
2137    let text = {
2138        let t = text.trim_start();
2139        if t.to_lowercase().starts_with("</think>") {
2140            &t[8..]
2141        } else {
2142            text
2143        }
2144    };
2145
2146    let lower = text.to_lowercase();
2147
2148    // Use the official Gemma-4 closing tag — answer is everything after it.
2149    if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
2150        let answer = text[end..]
2151            .replace("<|channel>thought", "")
2152            .replace("<channel|>", "");
2153        return answer.trim().replace("\n\n\n", "\n\n").to_string();
2154    }
2155
2156    // No closing tag — if there's an unclosed opening tag, discard everything before and during it.
2157    let first_open = [
2158        lower.find("<|channel>thought"), // Prioritize Gemma-4 native
2159        lower.find("<think>"),
2160        lower.find("<thought>"),
2161        lower.find("<|think|>"),
2162    ]
2163    .iter()
2164    .filter_map(|&x| x)
2165    .min();
2166
2167    if let Some(start) = first_open {
2168        if start > 0 {
2169            return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
2170        }
2171        return String::new();
2172    }
2173
2174    // If the model outputs 'naked' reasoning without tags:
2175    // Strip leading sentences like "The user asked..." or "I should present..."
2176    // if they appear before actual answer content.
2177    let naked_reasoning_phrases: &[&str] = &[
2178        "the user asked",
2179        "the user is asking",
2180        "the user wants",
2181        "i will structure",
2182        "i should provide",
2183        "i should give",
2184        "i should avoid",
2185        "i should note",
2186        "i should focus",
2187        "i should keep",
2188        "i should respond",
2189        "i should present",
2190        "i should display",
2191        "i should show",
2192        "i need to",
2193        "i can see from",
2194        "without being overly",
2195        "let me ",
2196        "necessary information in my identity",
2197        "was computed successfully",
2198        "computed successfully",
2199    ];
2200    let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
2201    if is_naked_reasoning {
2202        let lines: Vec<&str> = text.lines().collect();
2203        if !lines.is_empty() {
2204            // Skip leading lines that are themselves reasoning prose or blank.
2205            // Stop skipping at the first line that looks like real answer content.
2206            let mut start_idx = 0;
2207            for (i, line) in lines.iter().enumerate() {
2208                let l = line.to_lowercase();
2209                let is_reasoning_line =
2210                    naked_reasoning_phrases.iter().any(|p| l.contains(p)) || l.trim().is_empty();
2211                if is_reasoning_line {
2212                    start_idx = i + 1;
2213                } else {
2214                    break;
2215                }
2216            }
2217            if start_idx < lines.len() {
2218                return lines[start_idx..]
2219                    .join("\n")
2220                    .trim()
2221                    .replace("\n\n\n", "\n\n")
2222                    .to_string();
2223            }
2224            // Entire response was reasoning prose — return empty.
2225            return String::new();
2226        }
2227    }
2228
2229    // Strip leaked XML tool-call fragments that Qwen sometimes emits when it
2230    // abandons a tool call mid-generation (e.g. </parameter></function></tool_call>).
2231    let cleaned = strip_xml_tool_call_artifacts(text);
2232    cleaned.trim().replace("\n\n\n", "\n\n").to_string()
2233}
2234
/// Remove stray XML tool-call closing/opening tags that local models occasionally
/// leak into visible output when they start-then-abandon a tool call.
///
/// Matching is ASCII case-insensitive. Tags are removed one kind at a time,
/// repeating until none of that kind remain, so a tag formed by an earlier
/// removal is removed as well. Surrounding whitespace is left untouched.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    // Tags to remove (both open and close forms). All entries are lowercase ASCII.
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        // Stray think/reasoning closing tags that leak after block extraction.
        "</think>",
        "</thought>",
        "</thinking>",
    ];

    let mut out = text.to_string();
    // ASCII lowercasing preserves byte offsets, so a position found in `lower`
    // is valid in `out`. Both are edited in lockstep instead of re-lowercasing
    // the whole string after every removal.
    let mut lower = out.to_ascii_lowercase();
    for tag in XML_ARTIFACTS {
        while let Some(pos) = lower.find(*tag) {
            let end = pos + tag.len();
            out.drain(pos..end);
            lower.drain(pos..end);
        }
    }
    out
}
2267
2268/// Extract native Gemma-4 <|tool_call|> tags from text.
2269/// Format: <|tool_call|>call:func_name{key:<|"|>value<|"|>, key2:value2}<tool_call|>
2270pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2271    use regex::Regex;
2272    let mut results = Vec::new();
2273
2274    // Regex to find the tool call block
2275    // Formats supported:
2276    // <|tool_call|>call:func_name{args}<tool_call|>
2277    // <|tool_call>call:func_name{args}[END_TOOL_REQUEST]
2278    // <|tool_call>call:func_name{args}<tool_call|>
2279    let re_call = Regex::new(
2280        r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2281    ).unwrap();
2282    // Regex to find arguments inside the braces
2283    // Handles <|"|> wrappers and plain values
2284    let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2285
2286    for cap in re_call.captures_iter(text) {
2287        let name = cap[1].to_string();
2288        let args_str = &cap[2];
2289        let mut arguments = serde_json::Map::new();
2290
2291        for arg_cap in re_arg.captures_iter(args_str) {
2292            let key = arg_cap[1].to_string();
2293            // arg_cap[2] is the <|"|> wrapped value, arg_cap[3] is the plain value
2294            let val_raw = arg_cap
2295                .get(2)
2296                .map(|m| m.as_str())
2297                .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2298                .unwrap_or("")
2299                .trim();
2300            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2301
2302            // Try to parse as JSON types (bool, number), otherwise string
2303            let val = if normalized_raw == "true" {
2304                Value::Bool(true)
2305            } else if normalized_raw == "false" {
2306                Value::Bool(false)
2307            } else if let Ok(n) = normalized_raw.parse::<i64>() {
2308                Value::Number(n.into())
2309            } else if let Ok(n) = normalized_raw.parse::<u64>() {
2310                Value::Number(n.into())
2311            } else if let Ok(n) = normalized_raw.parse::<f64>() {
2312                serde_json::Number::from_f64(n)
2313                    .map(Value::Number)
2314                    .unwrap_or(Value::String(normalized_raw.clone()))
2315            } else {
2316                Value::String(normalized_raw)
2317            };
2318
2319            arguments.insert(key, val);
2320        }
2321
2322        results.push(ToolCallResponse {
2323            id: format!("call_{}", rand::random::<u32>()),
2324            call_type: "function".to_string(),
2325            function: ToolCallFn {
2326                name,
2327                arguments: Value::Object(arguments).to_string(),
2328            },
2329        });
2330    }
2331
2332    results
2333}
2334
2335pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2336    let trimmed = raw.trim();
2337    let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2338
2339    let mut value = match serde_json::from_str::<Value>(&candidate) {
2340        Ok(v) => v,
2341        Err(_) => return candidate,
2342    };
2343    normalize_tool_argument_value(tool_name, &mut value);
2344    value.to_string()
2345}
2346
2347fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2348    match value {
2349        Value::String(s) => *s = normalize_string_arg(s),
2350        Value::Array(items) => {
2351            for item in items {
2352                normalize_tool_argument_value(tool_name, item);
2353            }
2354        }
2355        Value::Object(map) => {
2356            for val in map.values_mut() {
2357                normalize_tool_argument_value(tool_name, val);
2358            }
2359            if tool_name == "grep_files" {
2360                if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2361                    *pattern = normalize_regex_pattern(pattern);
2362                }
2363            }
2364            for key in ["path", "extension", "query", "command", "reason"] {
2365                if let Some(Value::String(s)) = map.get_mut(key) {
2366                    *s = normalize_string_arg(s);
2367                }
2368            }
2369        }
2370        _ => {}
2371    }
2372}
2373
/// Strip exactly one layer of matching quotes (`"`, `'` or a backtick) from `input`.
///
/// Returns `None` when `input` is not wrapped in a matching pair. Otherwise the
/// inner text is returned with `\"` replaced by `"`, then `\\` replaced by `\`,
/// and finally trimmed.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    // A match needs the same quote character at both ends, which implies at
    // least two characters of input.
    let inner = ['"', '\'', '`']
        .iter()
        .find_map(move |&quote| input.strip_prefix(quote)?.strip_suffix(quote))?;

    let unescaped = inner.replace("\\\"", "\"").replace("\\\\", "\\");
    Some(unescaped.trim().to_string())
}
2387
/// Trim `input` and peel off every layer of matching quote wrappers.
///
/// While the text both starts and ends with the same quote character (`"`, `'`
/// or a backtick) and is at least two bytes long, that pair is removed and the
/// remainder is trimmed again. A lone quote character is left as-is.
fn normalize_string_arg(input: &str) -> String {
    const QUOTES: [char; 3] = ['"', '\'', '`'];

    let mut current = input.trim();
    // `strip_prefix` followed by `strip_suffix` only succeeds when two distinct
    // quote characters bracket the text, i.e. the length is at least two.
    while let Some(inner) = QUOTES
        .iter()
        .find_map(move |&quote| current.strip_prefix(quote)?.strip_suffix(quote))
    {
        current = inner.trim();
    }
    current.to_string()
}
2405
2406fn normalize_regex_pattern(input: &str) -> String {
2407    let out = normalize_string_arg(input);
2408    if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2409        out[1..out.len() - 1].to_string()
2410    } else {
2411        out
2412    }
2413}
2414
2415fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2416    let mut system_blocks = Vec::new();
2417    let mut prepared = Vec::new();
2418    let mut seeded = false;
2419
2420    for message in messages {
2421        if message.role == "system" {
2422            let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2423                .trim()
2424                .to_string();
2425            if !cleaned.is_empty() {
2426                system_blocks.push(cleaned);
2427            }
2428            continue;
2429        }
2430
2431        let mut clone = message.clone();
2432        clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2433
2434        if !seeded && message.role == "user" {
2435            let mut merged = String::new();
2436            if !system_blocks.is_empty() {
2437                merged.push_str("System instructions for this turn:\n");
2438                merged.push_str(&system_blocks.join("\n\n"));
2439                merged.push_str("\n\n");
2440            }
2441            merged.push_str(clone.content.as_str());
2442            clone.content = MessageContent::Text(merged);
2443            seeded = true;
2444        }
2445
2446        prepared.push(clone);
2447    }
2448
2449    if !seeded && !system_blocks.is_empty() {
2450        prepared.insert(
2451            0,
2452            ChatMessage::user(&format!(
2453                "System instructions for this turn:\n{}",
2454                system_blocks.join("\n\n")
2455            )),
2456        );
2457    }
2458
2459    prepared
2460}
2461
/// Remove legacy `<|turn>role\n` openers and `<turn|>` closers from `text`,
/// then trim surrounding whitespace.
///
/// Markers are removed one kind at a time, in the order listed below.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const TURN_MARKERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let without_markers = TURN_MARKERS
        .iter()
        .fold(text.to_string(), |acc, &marker| acc.replace(marker, ""));
    without_markers.trim().to_string()
}
2471
2472pub fn strip_native_tool_call_text(text: &str) -> String {
2473    use regex::Regex;
2474    let re_call = Regex::new(
2475        r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2476    ).unwrap();
2477    let re_response =
2478        Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2479            .unwrap();
2480    let without_calls = re_call.replace_all(text, "");
2481    re_response
2482        .replace_all(without_calls.as_ref(), "")
2483        .trim()
2484        .to_string()
2485}
2486
// Unit tests for the system prompt and for Gemma-native tool-call parsing/stripping.
#[cfg(test)]
mod tests {
    use super::*;

    /// The generated system prompt must mention the running Hematite version.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let system = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);
        assert!(system.contains(crate::HEMATITE_VERSION));
    }

    /// A call that opens with `<|tool_call>` and closes with `<tool_call|>` is
    /// extracted, its bare numeric arguments are typed as integers, its
    /// `\"`-escaped path is unquoted, and the call text is stripped from the
    /// visible output.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        let text = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "read_file");

        // Arguments are serialized as a JSON object string; parse them back to inspect.
        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args.get("limit").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(args.get("offset").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(
            args.get("path").and_then(|v| v.as_str()),
            Some("src/ui/tui.rs")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<tool_call|>"));
    }

    /// A transcript in which each tool call is followed by a `<|tool_response>`
    /// span yields both calls in order, and stripping leaves no tool-call or
    /// tool-response tags behind.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let text = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:map_project{focus:<|\"|>src/<|\"|>,include_symbols:true}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "map_project");
        assert_eq!(calls[1].function.name, "read_file");

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<|tool_response"));
        assert!(!stripped.contains("<tool_response|>"));
    }
}
hematite/agent/inference.rs

hematite/agent/
inference.rs