hematite/agent/
inference.rs

1use serde::Serialize;
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6pub use crate::agent::types::*;
7
8// ── Engine ────────────────────────────────────────────────────────────────────
9
/// Handle to the active model provider together with cached runtime state
/// (model id, context length) and session-wide settings.
pub struct InferenceEngine {
    /// Active provider backend, guarded by an async read/write lock.
    pub provider:
        std::sync::Arc<tokio::sync::RwLock<Box<dyn crate::agent::provider::ModelProvider>>>,
    /// Model identifier last recorded by `set_runtime_profile`.
    pub cached_model: std::sync::Arc<std::sync::RwLock<String>>,
    /// Context length (tokens) last recorded by `set_runtime_profile`.
    pub cached_context: std::sync::Arc<std::sync::atomic::AtomicUsize>,
    /// `scheme://host[:port]` portion of the API URL, computed in `new`.
    pub base_url: String,
    /// Persona label interpolated into the non-professional system prompts.
    pub species: String,
    /// Snark level supplied at construction.
    // NOTE(review): the prompt builders visible here take their own `snark`
    // argument instead of reading this field — confirm where it is consumed.
    pub snark: u8,
    /// Semaphore created with 3 permits in `new`.
    // NOTE(review): presumably bounds concurrent KV-cache-heavy requests — confirm.
    pub kv_semaphore: Semaphore,
    /// Session economics bookkeeping shared behind a mutex.
    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
    /// Optional model ID for worker-level tasks (Swarms / research).
    pub worker_model: Option<String>,
    /// Opt-in Gemma-native request shaping. Off by default.
    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
    /// Global cancellation token for hard-interrupting the inference stream.
    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
}
27
/// Reports whether `model` names a Gemma 4 build.
///
/// The check is a case-insensitive substring match on `gemma-4` or `gemma4`.
pub fn is_hematite_native_model(model: &str) -> bool {
    let normalized = model.to_ascii_lowercase();
    ["gemma-4", "gemma4"]
        .iter()
        .any(|marker| normalized.contains(*marker))
}
32
33fn should_use_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
34    is_hematite_native_model(model) && engine.gemma_native_formatting_enabled()
35}
36
37// ── OpenAI Tool Definition ────────────────────────────────────────────────────
38
39pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
40    if name.starts_with("mcp__") {
41        let lower = name.to_ascii_lowercase();
42        let mutates_workspace = [
43            "__edit",
44            "__write",
45            "__create",
46            "__move",
47            "__delete",
48            "__remove",
49            "__rename",
50            "__replace",
51            "__patch",
52        ]
53        .iter()
54        .any(|needle| lower.contains(needle));
55        return ToolMetadata {
56            category: ToolCategory::External,
57            mutates_workspace,
58            external_surface: true,
59            trust_sensitive: true,
60            read_only_friendly: !mutates_workspace,
61            plan_scope: false,
62        };
63    }
64
65    match name {
66        "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
67            category: ToolCategory::RepoRead,
68            mutates_workspace: false,
69            external_surface: false,
70            trust_sensitive: false,
71            read_only_friendly: true,
72            plan_scope: true,
73        },
74        "create_directory" | "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => {
75            ToolMetadata {
76                category: ToolCategory::RepoWrite,
77                mutates_workspace: true,
78                external_surface: false,
79                trust_sensitive: true,
80                read_only_friendly: false,
81                plan_scope: true,
82            }
83        }
84        "trace_runtime_flow" => ToolMetadata {
85            category: ToolCategory::Architecture,
86            mutates_workspace: false,
87            external_surface: false,
88            trust_sensitive: false,
89            read_only_friendly: true,
90            plan_scope: false,
91        },
92        "describe_toolchain" => ToolMetadata {
93            category: ToolCategory::Toolchain,
94            mutates_workspace: false,
95            external_surface: false,
96            trust_sensitive: false,
97            read_only_friendly: true,
98            plan_scope: false,
99        },
100        "shell" => ToolMetadata {
101            category: ToolCategory::Runtime,
102            mutates_workspace: true,
103            external_surface: false,
104            trust_sensitive: true,
105            read_only_friendly: false,
106            plan_scope: false,
107        },
108        "inspect_host" => ToolMetadata {
109            category: ToolCategory::Runtime,
110            mutates_workspace: false,
111            external_surface: false,
112            trust_sensitive: false,
113            read_only_friendly: true,
114            plan_scope: false,
115        },
116        "resolve_host_issue" => ToolMetadata {
117            category: ToolCategory::Runtime,
118            mutates_workspace: true,
119            external_surface: true,
120            trust_sensitive: true,
121            read_only_friendly: false,
122            plan_scope: false,
123        },
124        "run_hematite_maintainer_workflow" => ToolMetadata {
125            category: ToolCategory::Workflow,
126            mutates_workspace: true,
127            external_surface: false,
128            trust_sensitive: true,
129            read_only_friendly: false,
130            plan_scope: false,
131        },
132        "run_workspace_workflow" => ToolMetadata {
133            category: ToolCategory::Workflow,
134            mutates_workspace: true,
135            external_surface: false,
136            trust_sensitive: true,
137            read_only_friendly: false,
138            plan_scope: false,
139        },
140        "verify_build" => ToolMetadata {
141            category: ToolCategory::Verification,
142            mutates_workspace: false,
143            external_surface: false,
144            trust_sensitive: false,
145            read_only_friendly: true,
146            plan_scope: true,
147        },
148        "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
149            ToolMetadata {
150                category: ToolCategory::Git,
151                mutates_workspace: true,
152                external_surface: false,
153                trust_sensitive: true,
154                read_only_friendly: false,
155                plan_scope: false,
156            }
157        }
158        "research_web" | "fetch_docs" => ToolMetadata {
159            category: ToolCategory::Research,
160            mutates_workspace: false,
161            external_surface: false,
162            trust_sensitive: false,
163            read_only_friendly: true,
164            plan_scope: false,
165        },
166        "vision_analyze" => ToolMetadata {
167            category: ToolCategory::Vision,
168            mutates_workspace: false,
169            external_surface: false,
170            trust_sensitive: false,
171            read_only_friendly: true,
172            plan_scope: false,
173        },
174        "lsp_definitions"
175        | "lsp_references"
176        | "lsp_hover"
177        | "lsp_rename_symbol"
178        | "lsp_get_diagnostics"
179        | "lsp_search_symbol" => ToolMetadata {
180            category: ToolCategory::Lsp,
181            mutates_workspace: false,
182            external_surface: false,
183            trust_sensitive: false,
184            read_only_friendly: true,
185            plan_scope: false,
186        },
187        "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
188            category: ToolCategory::Workflow,
189            mutates_workspace: false,
190            external_surface: false,
191            trust_sensitive: false,
192            read_only_friendly: true,
193            plan_scope: true,
194        },
195        "manage_tasks" => ToolMetadata {
196            category: ToolCategory::Workflow,
197            mutates_workspace: false,
198            external_surface: false,
199            trust_sensitive: false,
200            read_only_friendly: true,
201            plan_scope: false,
202        },
203        _ => ToolMetadata {
204            category: ToolCategory::Other,
205            mutates_workspace: false,
206            external_surface: false,
207            trust_sensitive: false,
208            read_only_friendly: true,
209            plan_scope: false,
210        },
211    }
212}
213// ── Message types migrated to types.rs ────────────────────────────────────────
214
215// ── HTTP request / response shapes ───────────────────────────────────────────
216
// NOTE(review): presumably the lower and upper clamp for the number of output
// tokens reserved when budgeting a request; neither constant is referenced in
// the visible part of this file — confirm against the call sites.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1024;
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4096;
219
/// A context window of at most 8,192 tokens is treated as "tiny".
fn is_tiny_context_window(context_length: usize) -> bool {
    matches!(context_length, 0..=8_192)
}
223
/// A context window above 8,192 and up to 49,152 tokens (inclusive) is
/// treated as "compact".
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
227
/// Public wrapper around the private `is_compact_context_window` check so
/// other modules can test whether a context length falls in the compact tier.
pub fn is_compact_context_window_pub(context_length: usize) -> bool {
    is_compact_context_window(context_length)
}
231
/// Detects provider error text that indicates the prompt did not fit the
/// context window.
///
/// `lower` is expected to be already lowercased by the caller; matching is
/// by plain substring.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const SINGLE_MARKERS: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    // `n_keep` and `n_ctx` only count when they appear together.
    let mentions_keep_and_ctx = lower.contains("n_keep") && lower.contains("n_ctx");
    mentions_keep_and_ctx || SINGLE_MARKERS.iter().any(|marker| lower.contains(*marker))
}
239
240fn classify_runtime_failure_tag(detail: &str) -> &'static str {
241    let lower = detail.to_ascii_lowercase();
242    if lower.contains("context_window_blocked")
243        || lower.contains("context ceiling reached")
244        || lower.contains("exceeds the")
245        || is_provider_context_limit_detail(&lower)
246    {
247        "context_window"
248    } else if lower.contains("empty response from model")
249        || lower.contains("model returned an empty response")
250    {
251        "empty_model_response"
252    } else if lower.contains("action blocked:")
253        || lower.contains("access denied")
254        || lower.contains("declined by user")
255    {
256        "tool_policy_blocked"
257    } else {
258        "provider_degraded"
259    }
260}
261
/// Returns the user-facing recovery hint for a failure tag.
///
/// Any tag other than `context_window`, `empty_model_response`, or
/// `tool_policy_blocked` gets the generic retry guidance.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const CONTEXT_WINDOW: &str = "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    const EMPTY_RESPONSE: &str = "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    const POLICY_BLOCKED: &str =
        "Stay inside the allowed workflow or switch modes before retrying.";
    const GENERIC: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";

    if tag == "context_window" {
        CONTEXT_WINDOW
    } else if tag == "empty_model_response" {
        EMPTY_RESPONSE
    } else if tag == "tool_policy_blocked" {
        POLICY_BLOCKED
    } else {
        GENERIC
    }
}
276
277fn format_runtime_failure_message(detail: &str) -> String {
278    let tag = classify_runtime_failure_tag(detail);
279    format!(
280        "[failure:{}] {} Detail: {}",
281        tag,
282        runtime_failure_guidance(tag),
283        detail.trim()
284    )
285}
286
287// ── Events pushed to the TUI (migrated to types.rs) ──────────────────────────
288
289// ── Engine implementation ─────────────────────────────────────────────────────
290
291impl InferenceEngine {
292    pub fn new(
293        api_url: String,
294        species: String,
295        snark: u8,
296    ) -> Result<Self, Box<dyn std::error::Error>> {
297        let client = reqwest::Client::builder()
298            .timeout(std::time::Duration::from_secs(180))
299            .build()?;
300
301        let base_url = {
302            let trimmed = api_url.trim_end_matches('/');
303            if let Some(scheme_end) = trimmed.find("://") {
304                let after_scheme = &trimmed[scheme_end + 3..];
305                if let Some(path_start) = after_scheme.find('/') {
306                    format!(
307                        "{}://{}",
308                        &trimmed[..scheme_end],
309                        &after_scheme[..path_start]
310                    )
311                } else {
312                    trimmed.to_string()
313                }
314            } else {
315                trimmed.to_string()
316            }
317        };
318
319        let api_url_full = if api_url.ends_with("/chat/completions") {
320            api_url
321        } else if api_url.ends_with("/") {
322            format!("{}chat/completions", api_url)
323        } else {
324            format!("{}/chat/completions", api_url)
325        };
326
327        let lms = crate::agent::lms::LmsHarness::new();
328        let ollama_harness = crate::agent::ollama::OllamaHarness::new(&base_url);
329
330        let provider = if base_url.contains("11434") {
331            Box::new(crate::agent::provider::OllamaProvider {
332                client: client.clone(),
333                base_url: base_url.clone(),
334                model: String::new(),
335                context_length: 8192,
336                embed_model: std::sync::Arc::new(std::sync::RwLock::new(None)),
337                ollama: ollama_harness,
338            }) as Box<dyn crate::agent::provider::ModelProvider>
339        } else {
340            Box::new(crate::agent::provider::LmsProvider {
341                client: client.clone(),
342                api_url: api_url_full,
343                base_url: base_url.clone(),
344                model: String::new(),
345                context_length: 0,
346                lms,
347            }) as Box<dyn crate::agent::provider::ModelProvider>
348        };
349
350        Ok(Self {
351            provider: std::sync::Arc::new(tokio::sync::RwLock::new(provider)),
352            cached_model: std::sync::Arc::new(std::sync::RwLock::new(String::new())),
353            cached_context: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)),
354            base_url: base_url.clone(),
355            species: species.clone(),
356            snark,
357            kv_semaphore: Semaphore::new(3),
358            economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
359            worker_model: None,
360            gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
361            cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
362        })
363    }
364
    /// Turns the opt-in Gemma-native request shaping flag on or off.
    ///
    /// The flag is an atomic stored with `SeqCst` ordering, so this only
    /// needs `&self`.
    pub fn set_gemma_native_formatting(&self, enabled: bool) {
        self.gemma_native_formatting
            .store(enabled, std::sync::atomic::Ordering::SeqCst);
    }
369
    /// Returns the result of the active provider's health check.
    ///
    /// Holds a read lock on the provider for the duration of the call.
    pub async fn health_check(&self) -> bool {
        let p = self.provider.read().await;
        p.health_check().await
    }
374
    /// Returns the active provider's name as an owned string.
    pub async fn provider_name(&self) -> String {
        let p = self.provider.read().await;
        p.name().to_string()
    }
379
380    pub async fn get_loaded_model(&self) -> Option<String> {
381        let p = self.provider.read().await;
382        match p.detect_model().await {
383            Ok(m) if m.is_empty() => Some("".to_string()),
384            Ok(m) => Some(m),
385            Err(_) => None,
386        }
387    }
388
    /// Returns the embedding model reported by the active provider, if any.
    pub async fn get_embedding_model(&self) -> Option<String> {
        let p = self.provider.read().await;
        p.get_embedding_model().await
    }
393
    /// Asks the active provider to load `model_id`.
    ///
    /// # Errors
    ///
    /// Propagates the provider's error message unchanged.
    pub async fn load_model(&self, model_id: &str) -> Result<(), String> {
        let p = self.provider.read().await;
        p.load_model(model_id).await
    }
398
    /// Asks the active provider to load `model_id`, forwarding the optional
    /// `context_length` as given.
    ///
    /// # Errors
    ///
    /// Propagates the provider's error message unchanged.
    pub async fn load_model_with_context(
        &self,
        model_id: &str,
        context_length: Option<usize>,
    ) -> Result<(), String> {
        let p = self.provider.read().await;
        p.load_model_with_context(model_id, context_length).await
    }
407
    /// Asks the active provider to load the embedding model `model_id`.
    ///
    /// # Errors
    ///
    /// Propagates the provider's error message unchanged.
    pub async fn load_embedding_model(&self, model_id: &str) -> Result<(), String> {
        let p = self.provider.read().await;
        p.load_embedding_model(model_id).await
    }
412
    /// Lists model identifiers from the active provider.
    ///
    /// `kind` and `loaded_only` are forwarded to the provider's `list_models`
    /// unchanged.
    ///
    /// # Errors
    ///
    /// Propagates the provider's error message unchanged.
    pub async fn list_provider_models(
        &self,
        kind: crate::agent::provider::ProviderModelKind,
        loaded_only: bool,
    ) -> Result<Vec<String>, String> {
        let p = self.provider.read().await;
        p.list_models(kind, loaded_only).await
    }
421
    /// Asks the active provider to unload a model.
    ///
    /// `model_id` and `all` are forwarded unchanged; how they interact is
    /// defined by the provider. On success the provider's message is returned.
    ///
    /// # Errors
    ///
    /// Propagates the provider's error message unchanged.
    pub async fn unload_model(&self, model_id: Option<&str>, all: bool) -> Result<String, String> {
        let p = self.provider.read().await;
        p.unload_model(model_id, all).await
    }
426
    /// Asks the active provider to unload an embedding model, forwarding the
    /// optional `model_id` unchanged. On success the provider's message is
    /// returned.
    ///
    /// # Errors
    ///
    /// Propagates the provider's error message unchanged.
    pub async fn unload_embedding_model(&self, model_id: Option<&str>) -> Result<String, String> {
        let p = self.provider.read().await;
        p.unload_embedding_model(model_id).await
    }
431
    /// Delegates to the active provider's `prewarm`.
    ///
    /// # Errors
    ///
    /// Propagates the provider's error message unchanged.
    pub async fn prewarm(&self) -> Result<(), String> {
        let p = self.provider.read().await;
        p.prewarm().await
    }
436
    /// Returns the context length reported by the active provider's
    /// `detect_context_length`.
    pub async fn detect_context_length(&self) -> usize {
        let p = self.provider.read().await;
        p.detect_context_length().await
    }
441
442    pub async fn set_runtime_profile(&self, model: &str, context_length: usize) {
443        if let Ok(mut guard) = self.cached_model.write() {
444            *guard = model.to_string();
445        }
446        self.cached_context
447            .store(context_length, std::sync::atomic::Ordering::SeqCst);
448
449        let mut p = self.provider.write().await;
450        p.set_runtime_profile(model, context_length);
451    }
452
453    pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
454        let previous_model = self.current_model();
455        let previous_context = self.current_context_length();
456
457        let detected_model = match self.get_loaded_model().await {
458            Some(m) if !m.is_empty() => m,
459            Some(_) => "no model loaded".to_string(),
460            None => previous_model.clone(),
461        };
462
463        let detected_context = self.detect_context_length().await;
464        let effective_model = if detected_model.is_empty() {
465            previous_model.clone()
466        } else {
467            detected_model
468        };
469        let effective_context = resolve_runtime_context(
470            &previous_model,
471            previous_context,
472            &effective_model,
473            detected_context,
474        );
475
476        let changed = effective_model != previous_model || effective_context != previous_context;
477        if changed {
478            self.set_runtime_profile(&effective_model, effective_context)
479                .await;
480        }
481
482        Some((effective_model, effective_context, changed))
483    }
484
485    pub fn build_system_prompt(
486        &self,
487        snark: u8,
488        chaos: u8,
489        brief: bool,
490        professional: bool,
491        tools: &[ToolDefinition],
492        reasoning_history: Option<&str>,
493        environment_summary: Option<&str>,
494        mcp_tools: &[crate::agent::mcp::McpTool],
495    ) -> String {
496        let mut sys = self.build_system_prompt_legacy(
497            snark,
498            chaos,
499            brief,
500            professional,
501            tools,
502            reasoning_history,
503            environment_summary,
504        );
505
506        if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
507            sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
508            sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
509            for tool in mcp_tools {
510                let description = tool
511                    .description
512                    .as_deref()
513                    .unwrap_or("No description provided.");
514                sys.push_str(&format!("- {}: {}\n", tool.name, description));
515            }
516        }
517
518        sys
519    }
520
521    pub fn build_system_prompt_legacy(
522        &self,
523        snark: u8,
524        _chaos: u8,
525        brief: bool,
526        professional: bool,
527        tools: &[ToolDefinition],
528        reasoning_history: Option<&str>,
529        environment_summary: Option<&str>,
530    ) -> String {
531        let current_context_length = self.current_context_length();
532        if is_tiny_context_window(current_context_length) {
533            return self.build_system_prompt_tiny(brief, professional);
534        }
535        if is_compact_context_window(current_context_length) {
536            return self.build_system_prompt_compact(brief, professional, tools);
537        }
538
539        // Hematite bootstrap: keep reasoning disciplined without leaking scaffolding into user-facing replies.
540        let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
541                                     - You are Hematite, a local coding system working on the user's machine.\n\
542                                     - The running Hematite build is ");
543        sys.push_str(&crate::hematite_version_display());
544        sys.push_str(".\n\
545                                     - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
546                                     - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
547                                     - For simple questions, answer briefly in plain language.\n\
548                                     - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
549                                     - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
550                                     - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
551                                     - Keep internal reasoning inside channel delimiters.\n\
552                                     - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
553                                     <turn|>\n\n");
554
555        if let Some(history) = reasoning_history {
556            if !history.is_empty() {
557                sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
558                sys.push_str(history);
559                sys.push_str("\n\n");
560            }
561        }
562
563        // ADAPTIVE THOUGHT EFFICIENCY (Gemma-4 Native)
564        if brief {
565            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
566                          - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
567                          - Depth: Surface-level verification only.\n\n");
568        } else {
569            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
570                          - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
571                          - Depth: Full multi-step derivation required.\n\n");
572        }
573
574        // IDENTITY & ENVIRONMENT
575        let os = std::env::consts::OS;
576        if let Some(summary) = environment_summary {
577            sys.push_str("## HOST ENVIRONMENT\n");
578            sys.push_str(summary);
579            sys.push_str("\n\n");
580        }
581
582        if professional {
583            sys.push_str(&format!(
584                "You are Hematite, a local coding system running on {}. \
585                 The TUI is one interface layer, not your whole identity. \
586                 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
587                 Skip filler and keep the focus on the work.\n",
588                os
589            ));
590        } else {
591            sys.push_str(&format!(
592                "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
593                 The terminal UI is only one surface of the system. \
594                 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
595                 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
596                self.species, snark, os
597            ));
598        }
599
600        // Inject loaded model and context window so the model knows its own budget.
601        let current_model = self.current_model();
602        if !current_model.is_empty() {
603            sys.push_str(&format!(
604                "Loaded model: {} | Context window: {} tokens. \
605                 Calibrate response length and tool-call depth to fit within this budget.\n\n",
606                current_model, current_context_length
607            ));
608            if is_hematite_native_model(&current_model) {
609                sys.push_str(
610                    "Sovereign native note: prefer exact tool JSON with no extra prose when calling tools. \
611                     Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
612                     For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
613                );
614            }
615        } else {
616            sys.push_str(&format!(
617                "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
618                current_context_length
619            ));
620        }
621
622        // PROTOCOL & TOOLS
623        let shell_desc = if cfg!(target_os = "windows") {
624            "[EXTERNAL SHELL]: `powershell` (Windows).\n\
625             - Use ONLY for builds, tests, or file migrations. \n\
626             - You MUST use the `powershell` tool directly. \n\
627             - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
628        } else {
629            "[EXTERNAL SHELL]: `bash` (Unix).\n\
630             - Use ONLY for builds, tests, or file migrations. \n\
631             - NEVER wrap bash in other shells. \n\n"
632        };
633
634        sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
635                      [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
636                      - These are the ONLY way to explore and modify code. \n\
637                      - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
638        sys.push_str(shell_desc);
639
640        // ANTI-LOOPING & SELF-AUDIT
641        sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
642                      SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
643
644        // Consolidated: All directives are now handled by the authoritative prompt.rs builder.
645        sys.push_str("## TURN ADVISORY\n");
646        if brief {
647            sys.push_str("- BRIEF MODE: Respond with ONE concise sentence/block unless more code is required.\n");
648        }
649        sys.push_str("- INTERNAL REASONING: Plan your move in the thought channel first.\n");
650
651        // Scaffolding protocol — enforces build validation after project creation.
652        sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
653            2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
654            3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
655            4. Fix all errors before declaring success.\n\n\
656            ## PRE-FLIGHT SCOPING PROTOCOL\n\
657            Before attempting any multi-file task or complex refactor:\n\
658            1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
659            2. Use `auto_pin_context` to keep those files in active context.\n\
660            3. Only then proceed to deeper edits or research.\n\n\
661            ## REFACTORING PROTOCOL\n\
662            When modifying existing code or renaming symbols:\n\
663            1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
664            2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
665            3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
666
667        // Inject CLAUDE.md / instruction files from the project directory.
668        sys.push_str(&load_instruction_files());
669        sys.push_str(&load_agent_skill_catalog());
670
671        // Inject cross-session memories synthesized by DeepReflect.
672        sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
673
674        // Native Gemma-4 Tool Declarations
675        if !tools.is_empty() {
676            sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
677            for tool in tools {
678                let schema = serde_json::to_string(&tool.function.parameters)
679                    .unwrap_or_else(|_| "{}".to_string());
680                sys.push_str(&format!(
681                    "<|tool>declaration:{}{}{}<tool|>\n",
682                    tool.function.name, "{", schema
683                ));
684                sys.push_str(&format!("// {})\n", tool.function.description));
685            }
686        }
687
688        sys
689    }
690
691    fn build_system_prompt_compact(
692        &self,
693        brief: bool,
694        professional: bool,
695        tools: &[ToolDefinition],
696    ) -> String {
697        // Compact tier: fits in 16k context. Keeps tool names + one-line descriptions
698        // but skips full JSON schemas, verbose protocol sections, and CLAUDE.md injection.
699        let current_model = self.current_model();
700        let current_context_length = self.current_context_length();
701        let os = std::env::consts::OS;
702
703        let mut sys = String::from("<|turn>system\n<|think|>\n");
704        sys.push_str(&format!(
705            "You are Hematite {}, a local coding harness working on the user's machine.\n",
706            crate::hematite_version_display()
707        ));
708        if professional {
709            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
710        } else {
711            sys.push_str(&format!(
712                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
713                self.species
714            ));
715        }
716        sys.push_str(&format!(
717            "Model: {} | Context: {} tokens. Keep turns focused.\n",
718            current_model, current_context_length
719        ));
720        if is_hematite_native_model(&current_model) {
721            sys.push_str(
722                "Sovereign native: use exact tool JSON. No extra prose in tool calls. \
723                 Raw regex patterns in grep_files, no slash delimiters.\n",
724            );
725        }
726        if cfg!(target_os = "windows") {
727            sys.push_str(&format!(
728                "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
729                os
730            ));
731        } else {
732            sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
733        }
734        if brief {
735            sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
736        }
737
738        sys.push_str(
739            "\nCORE RULES:\n\
740             - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
741             - Verify after edits: run `verify_build` after code changes, before committing.\n\
742             - One tool at a time. Do not batch unrelated tool calls.\n\
743             - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
744             - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
745             - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
746        );
747
748        if !tools.is_empty() {
749            sys.push_str("\n# AVAILABLE TOOLS\n");
750            for tool in tools {
751                let desc: String = tool.function.description.chars().take(120).collect();
752                sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
753            }
754        }
755
756        sys.push_str("<turn|>\n");
757        sys
758    }
759
760    fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
761        let current_model = self.current_model();
762        let current_context_length = self.current_context_length();
763        let os = std::env::consts::OS;
764        let mut sys = format!(
765            "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
766            crate::hematite_version_display()
767        );
768        if professional {
769            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
770        } else {
771            sys.push_str(&format!(
772                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
773                self.species
774            ));
775        }
776        if !current_model.is_empty() {
777            sys.push_str(&format!(
778                "Loaded model: {} | Context window: {} tokens.\n",
779                current_model, current_context_length
780            ));
781        } else {
782            sys.push_str(&format!(
783                "Context window: {} tokens.\n",
784                current_context_length
785            ));
786        }
787        sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
788        sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
789        sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
790        if cfg!(target_os = "windows") {
791            sys.push_str(&format!(
792                "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
793                os
794            ));
795        } else {
796            sys.push_str(&format!(
797                "You are running on {}. Use the native Unix shell conventions.\n",
798                os
799            ));
800        }
801        if brief {
802            sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
803        }
804        sys.push_str("<turn|>\n");
805        sys
806    }
807
808    pub fn current_model(&self) -> String {
809        self.cached_model
810            .read()
811            .map(|g| g.clone())
812            .unwrap_or_default()
813    }
814
815    pub fn current_context_length(&self) -> usize {
816        self.cached_context
817            .load(std::sync::atomic::Ordering::Relaxed)
818    }
819
820    pub fn is_compact_context_window(&self) -> bool {
821        let len = self.current_context_length();
822        len <= 16384
823    }
824
825    pub fn gemma_native_formatting_enabled(&self) -> bool {
826        self.gemma_native_formatting
827            .load(std::sync::atomic::Ordering::Relaxed)
828    }
829
830    pub async fn call_with_tools(
831        &self,
832        messages: &[ChatMessage],
833        tools: &[ToolDefinition],
834        // Override the model ID for this call. None = use the live runtime model.
835        model_override: Option<&str>,
836    ) -> Result<
837        (
838            Option<String>,
839            Option<Vec<ToolCallResponse>>,
840            Option<TokenUsage>,
841            Option<String>,
842        ),
843        String,
844    > {
845        let _permit = self
846            .kv_semaphore
847            .acquire()
848            .await
849            .map_err(|e| e.to_string())?;
850
851        let (res, model_name, prepared_messages) = {
852            let p = self.provider.read().await;
853            let model_name = model_override.unwrap_or(&p.current_model()).to_string();
854            let prepared_messages = if should_use_native_formatting(self, &model_name) {
855                prepare_gemma_native_messages(messages)
856            } else {
857                messages.to_vec()
858            };
859            if let Err(detail) = preflight_chat_request(
860                &model_name,
861                &prepared_messages,
862                tools,
863                self.current_context_length(),
864            ) {
865                return Err(format_runtime_failure_message(&detail));
866            }
867            let res = p
868                .call_with_tools(&prepared_messages, tools, model_override)
869                .await
870                .map_err(|e| format_runtime_failure_message(&e))?;
871            (res, model_name, prepared_messages)
872        };
873
874        if let Ok(mut econ) = self.economics.lock() {
875            econ.input_tokens += res.usage.prompt_tokens;
876            econ.output_tokens += res.usage.completion_tokens;
877        }
878
879        let mut content = res.content;
880        let mut tool_calls = res.tool_calls;
881
882        // Post-processing: Gemma 4 / thinking block extraction
883        if let Some(text) = &content {
884            if should_use_native_formatting(self, &model_name) {
885                let native_calls = extract_native_tool_calls(text);
886                if !native_calls.is_empty() {
887                    let mut existing = tool_calls.unwrap_or_default();
888                    existing.extend(native_calls);
889                    tool_calls = Some(existing);
890
891                    let stripped = strip_native_tool_call_text(text);
892                    content = if stripped.trim().is_empty() {
893                        None
894                    } else {
895                        Some(stripped)
896                    };
897                }
898            }
899        }
900
901        // Normalization: Tool arguments
902        if should_use_native_formatting(self, &model_name) {
903            if let Some(calls) = tool_calls.as_mut() {
904                for call in calls.iter_mut() {
905                    normalize_tool_argument_value(
906                        &call.function.name,
907                        &mut call.function.arguments,
908                    );
909                }
910            }
911        }
912
913        if should_use_native_formatting(self, &model_name)
914            && content.is_none()
915            && tool_calls.is_none()
916            && !prepared_messages.is_empty()
917        {
918            return Err(format_runtime_failure_message(
919                "model returned an empty response after native-format message preparation",
920            ));
921        }
922
923        Ok((content, tool_calls, Some(res.usage), res.finish_reason))
924    }
925
926    // ── Streaming call (used for plain-text responses) ────────────────────────
927
928    /// Stream a conversation (no tools). Emits Token/Done/Error events.
929    pub async fn stream_messages(
930        &self,
931        messages: &[ChatMessage],
932        tx: mpsc::Sender<InferenceEvent>,
933    ) -> Result<(), Box<dyn std::error::Error>> {
934        let provider = self.provider.read().await;
935        provider.stream(messages, tx).await
936    }
937
938    /// Single-turn streaming (legacy helper used by startup sequence).
939    pub async fn stream_generation(
940        &self,
941        prompt: &str,
942        snark: u8,
943        chaos: u8,
944        brief: bool,
945        professional: bool,
946        tx: mpsc::Sender<InferenceEvent>,
947    ) -> Result<(), Box<dyn std::error::Error>> {
948        let system =
949            self.build_system_prompt(snark, chaos, brief, professional, &[], None, None, &[]);
950        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
951        self.stream_messages(&messages, tx).await
952    }
953
954    // ── Swarm worker helpers (non-streaming) ──────────────────────────────────
955
956    /// Runs a task using the `worker_model` if set, otherwise falls back to the main `model`.
957    pub async fn generate_task_worker(
958        &self,
959        prompt: &str,
960        professional: bool,
961    ) -> Result<String, String> {
962        let current_model = self.current_model();
963        let model = self
964            .worker_model
965            .as_deref()
966            .unwrap_or(current_model.as_str());
967        self.generate_task_with_model(prompt, 0.1, professional, model)
968            .await
969    }
970
971    pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
972        self.generate_task_with_temp(prompt, 0.1, professional)
973            .await
974    }
975
976    pub async fn generate_task_with_temp(
977        &self,
978        prompt: &str,
979        temp: f32,
980        professional: bool,
981    ) -> Result<String, String> {
982        let current_model = self.current_model();
983        self.generate_task_with_model(prompt, temp, professional, &current_model)
984            .await
985    }
986
987    pub async fn generate_task_with_model(
988        &self,
989        prompt: &str,
990        _temp: f32,
991        professional: bool,
992        model: &str,
993    ) -> Result<String, String> {
994        let _permit = self
995            .kv_semaphore
996            .acquire()
997            .await
998            .map_err(|e| e.to_string())?;
999
1000        let system =
1001            self.build_system_prompt(self.snark, 50, false, professional, &[], None, None, &[]);
1002        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1003        if let Err(detail) =
1004            preflight_chat_request(model, &messages, &[], self.current_context_length())
1005        {
1006            return Err(format_runtime_failure_message(&detail));
1007        }
1008
1009        let p = self.provider.read().await;
1010        let res = p
1011            .call_with_tools(&messages, &[], Some(model))
1012            .await
1013            .map_err(|e| format_runtime_failure_message(&e))?;
1014
1015        res.content
1016            .ok_or_else(|| "Empty response from model".to_string())
1017    }
1018
1019    // ── History management ────────────────────────────────────────────────────
1020
1021    /// Prune middle turns when context grows too large, keeping system + recent N.
1022    #[allow(dead_code)]
1023    pub fn snip_history(
1024        &self,
1025        turns: &[ChatMessage],
1026        max_tokens_estimate: usize,
1027        keep_recent: usize,
1028    ) -> Vec<ChatMessage> {
1029        let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
1030        if total_chars / 4 <= max_tokens_estimate {
1031            return turns.to_vec();
1032        }
1033        let keep = keep_recent.min(turns.len());
1034        let mut snipped = vec![turns[0].clone()];
1035        if turns.len() > keep + 1 {
1036            snipped.push(ChatMessage::system(&format!(
1037                "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
1038                turns.len() - keep - 1
1039            )));
1040            snipped.extend_from_slice(&turns[turns.len() - keep..]);
1041        } else {
1042            snipped = turns.to_vec();
1043        }
1044        snipped
1045    }
1046}
1047
1048fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
1049    serde_json::to_vec(value)
1050        .ok()
1051        .map_or(0, |bytes| bytes.len() / 4 + 1)
1052}
1053
1054const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
1055
1056pub fn estimate_message_tokens(message: &ChatMessage) -> usize {
1057    let content_tokens = match &message.content {
1058        MessageContent::Text(s) => s.len() / 4 + 1,
1059        MessageContent::Parts(parts) => parts
1060            .iter()
1061            .map(|part| match part {
1062                ContentPart::Text { text } => text.len() / 4 + 1,
1063                // Image payloads are transported as data URLs, but their base64
1064                // length should not be treated like plain text context pressure.
1065                ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
1066            })
1067            .sum(),
1068    };
1069    let tool_tokens: usize = message
1070        .tool_calls
1071        .iter()
1072        .flatten()
1073        .map(|call| (call.function.name.len() + call.function.arguments.to_string().len()) / 4 + 4)
1074        .sum();
1075    content_tokens + tool_tokens + 6
1076}
1077
1078pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
1079    messages.iter().map(estimate_message_tokens).sum()
1080}
1081
1082fn reserved_output_tokens(context_length: usize) -> usize {
1083    let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
1084    proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
1085}
1086
1087pub fn estimate_prompt_pressure(
1088    messages: &[ChatMessage],
1089    tools: &[ToolDefinition],
1090    context_length: usize,
1091) -> (usize, usize, usize, u8) {
1092    let estimated_input_tokens =
1093        estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
1094    let reserved_output = reserved_output_tokens(context_length);
1095    let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
1096    let percent = if context_length == 0 {
1097        0
1098    } else {
1099        ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
1100    };
1101    (
1102        estimated_input_tokens,
1103        reserved_output,
1104        estimated_total,
1105        percent,
1106    )
1107}
1108
1109fn preflight_chat_request(
1110    model: &str,
1111    messages: &[ChatMessage],
1112    tools: &[ToolDefinition],
1113    context_length: usize,
1114) -> Result<(), String> {
1115    let (estimated_input_tokens, reserved_output, estimated_total, _) =
1116        estimate_prompt_pressure(messages, tools, context_length);
1117
1118    if estimated_total > context_length {
1119        return Err(format!(
1120            "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
1121            model, estimated_input_tokens, reserved_output, estimated_total, context_length
1122        ));
1123    }
1124
1125    Ok(())
1126}
1127
1128/// Walk from CWD up to 4 parent directories and collect project guidance files.
1129/// Looks for rule files plus optional skill guidance such as CLAUDE.md,
1130/// .hematite/rules.md, SKILLS.md, SKILL.md, and .hematite/instructions.md.
1131/// Deduplicates by content hash; truncates at 4KB per file, 12KB total.
1132fn load_instruction_files() -> String {
1133    use std::collections::hash_map::DefaultHasher;
1134    use std::collections::HashSet;
1135    use std::hash::{Hash, Hasher};
1136
1137    let Ok(cwd) = std::env::current_dir() else {
1138        return String::new();
1139    };
1140    let mut result = String::new();
1141    let mut seen: HashSet<u64> = HashSet::new();
1142    let mut total_chars: usize = 0;
1143    const MAX_TOTAL: usize = 12_000;
1144    const MAX_PER_FILE: usize = 4_000;
1145
1146    let mut dir = cwd.clone();
1147    for _ in 0..4 {
1148        for name in crate::agent::instructions::PROJECT_GUIDANCE_FILES {
1149            let path = crate::agent::instructions::resolve_guidance_path(&dir, name);
1150            if !path.exists() {
1151                continue;
1152            }
1153            let Ok(content) = std::fs::read_to_string(&path) else {
1154                continue;
1155            };
1156            if content.trim().is_empty() {
1157                continue;
1158            }
1159
1160            let mut hasher = DefaultHasher::new();
1161            content.hash(&mut hasher);
1162            let h = hasher.finish();
1163            if !seen.insert(h) {
1164                continue;
1165            }
1166
1167            let truncated = if content.len() > MAX_PER_FILE {
1168                format!("{}...[truncated]", &content[..MAX_PER_FILE])
1169            } else {
1170                content
1171            };
1172
1173            if total_chars + truncated.len() > MAX_TOTAL {
1174                break;
1175            }
1176            total_chars += truncated.len();
1177            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
1178        }
1179        match dir.parent().map(|p| p.to_owned()) {
1180            Some(p) => dir = p,
1181            None => break,
1182        }
1183    }
1184
1185    if result.is_empty() {
1186        return String::new();
1187    }
1188    format!("\n\n# Project Instructions And Skills\n{}", result)
1189}
1190
1191fn load_agent_skill_catalog() -> String {
1192    let workspace_root = crate::tools::file_ops::workspace_root();
1193    let config = crate::agent::config::load_config();
1194    let discovery =
1195        crate::agent::instructions::discover_agent_skills(&workspace_root, &config.trust);
1196    crate::agent::instructions::render_skill_catalog(&discovery, 6_000)
1197        .map(|rendered| format!("\n\n{}", rendered))
1198        .unwrap_or_default()
1199}
1200
/// Extracts the first Gemma-4 thought block from `text`.
///
/// Looks (ASCII case-insensitively) for `<|channel>thought` and returns the trimmed
/// text between it and the next `<channel|>`, or up to the end of `text` when the
/// block is never closed. Returns `None` when there is no opening tag or the block
/// is blank.
///
/// Matching uses ASCII-only case folding so that byte offsets found in the folded
/// copy are valid in `text`; full Unicode lowercasing can change byte lengths and
/// would shift (or invalidate) the slice boundaries.
pub fn extract_think_block(text: &str) -> Option<String> {
    // Official Gemma-4 native tags.
    const OPEN_TAG: &str = "<|channel>thought";
    const CLOSE_TAG: &str = "<channel|>";

    let lower = text.to_ascii_lowercase();

    let content_start = lower.find(OPEN_TAG)? + OPEN_TAG.len();
    let content_end = lower[content_start..]
        .find(CLOSE_TAG)
        .map_or(text.len(), |offset| content_start + offset);

    let content = text[content_start..content_end].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
1223
1224pub fn strip_think_blocks(text: &str) -> String {
1225    // Fast-path: strip a stray </think> the model emits at the start when it skips
1226    // the opening tag (common with Qwen after tool calls). Strip it before the lower
1227    // allocation so it can't slip through any branch below.
1228    let text = {
1229        let t = text.trim_start();
1230        if t.to_lowercase().starts_with("</think>") {
1231            &t[8..]
1232        } else {
1233            text
1234        }
1235    };
1236
1237    let lower = text.to_lowercase();
1238
1239    // Use the official Gemma-4 closing tag — answer is everything after it.
1240    if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
1241        let answer = text[end..]
1242            .replace("<|channel>thought", "")
1243            .replace("<channel|>", "");
1244        return answer.trim().replace("\n\n\n", "\n\n").to_string();
1245    }
1246
1247    // No closing tag — if there's an unclosed opening tag, discard everything before and during it.
1248    let first_open = [
1249        lower.find("<|channel>thought"), // Prioritize Gemma-4 native
1250        lower.find("<think>"),
1251        lower.find("<thinking>"),
1252        lower.find("<thought>"),
1253        lower.find("<|think|>"),
1254    ]
1255    .iter()
1256    .filter_map(|&x| x)
1257    .min();
1258
1259    if let Some(start) = first_open {
1260        if start > 0 {
1261            return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
1262        }
1263        return String::new();
1264    }
1265
1266    // If the model outputs 'naked' reasoning without tags:
1267    // Strip leading sentences like "The user asked..." or "I should present..."
1268    // if they appear before actual answer content.
1269    let naked_reasoning_phrases: &[&str] = &[
1270        "the user asked",
1271        "the user is asking",
1272        "the user wants",
1273        "i will structure",
1274        "i should provide",
1275        "i should give",
1276        "i should avoid",
1277        "i should note",
1278        "i should focus",
1279        "i should keep",
1280        "i should respond",
1281        "i should present",
1282        "i should display",
1283        "i should show",
1284        "i need to",
1285        "i can see from",
1286        "without being overly",
1287        "let me ",
1288        "necessary information in my identity",
1289        "was computed successfully",
1290        "computed successfully",
1291    ];
1292    let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
1293    if is_naked_reasoning {
1294        let lines: Vec<&str> = text.lines().collect();
1295        if !lines.is_empty() {
1296            // Skip leading lines that are themselves reasoning prose or blank.
1297            // Stop skipping at the first line that looks like real answer content.
1298            let mut start_idx = 0;
1299            for (i, line) in lines.iter().enumerate() {
1300                let l = line.to_lowercase();
1301                let is_reasoning_line =
1302                    naked_reasoning_phrases.iter().any(|p| l.contains(p)) || l.trim().is_empty();
1303                if is_reasoning_line {
1304                    start_idx = i + 1;
1305                } else {
1306                    break;
1307                }
1308            }
1309            if start_idx < lines.len() {
1310                return lines[start_idx..]
1311                    .join("\n")
1312                    .trim()
1313                    .replace("\n\n\n", "\n\n")
1314                    .to_string();
1315            }
1316            // Entire response was reasoning prose — return empty.
1317            return String::new();
1318        }
1319    }
1320
1321    // Strip leaked XML tool-call fragments that Qwen sometimes emits when it
1322    // abandons a tool call mid-generation (e.g. </parameter></function></tool_call>).
1323    let cleaned = strip_xml_tool_call_artifacts(text);
1324    cleaned.trim().replace("\n\n\n", "\n\n").to_string()
1325}
1326
/// Removes stray XML tool-call and reasoning tags that local models occasionally
/// leak into visible output when they start-then-abandon a tool call.
///
/// Tags are matched ASCII case-insensitively and removed one list entry at a time,
/// in list order, until no occurrence of that entry remains. Only the tags
/// themselves are removed; surrounding text and whitespace are left untouched.
///
/// A lowercased shadow copy is kept in lock-step with the output. ASCII-only
/// folding guarantees both strings share byte offsets, so a position found in the
/// shadow is always a valid range in the output (full Unicode lowercasing does not
/// give that guarantee), and the text is not re-lowercased on every iteration.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    // Tags to remove (both open and close forms). All entries are lowercase ASCII.
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        // Stray think/reasoning closing tags that leak after block extraction.
        "</think>",
        "<thinking>",
        "</thought>",
        "</thinking>",
    ];

    let mut out = text.to_string();
    let mut lowered = text.to_ascii_lowercase();
    for &tag in XML_ARTIFACTS {
        while let Some(pos) = lowered.find(tag) {
            let span = pos..pos + tag.len();
            out.drain(span.clone());
            lowered.drain(span);
        }
    }
    out
}
1360
1361/// Extract native Gemma-4 <|tool_call|> tags from text.
1362/// Format: <|tool_call|>call:func_name{key:<|"|>value<|"|>, key2:value2}<tool_call|>
1363pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
1364    use regex::Regex;
1365    let mut results = Vec::new();
1366
1367    // -- Format 1: Gemma 4 Native (call:name{args}) --
1368    let re_call = Regex::new(
1369        r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
1370    ).unwrap();
1371    let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
1372
1373    for cap in re_call.captures_iter(text) {
1374        let name = cap[1].to_string();
1375        let args_str = &cap[2];
1376        let mut arguments = serde_json::Map::new();
1377
1378        for arg_cap in re_arg.captures_iter(args_str) {
1379            let key = arg_cap[1].to_string();
1380            let val_raw = arg_cap
1381                .get(2)
1382                .map(|m| m.as_str())
1383                .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
1384                .unwrap_or("")
1385                .trim();
1386            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
1387
1388            let val = if normalized_raw == "true" {
1389                Value::Bool(true)
1390            } else if normalized_raw == "false" {
1391                Value::Bool(false)
1392            } else if let Ok(n) = normalized_raw.parse::<i64>() {
1393                Value::Number(n.into())
1394            } else if let Ok(n) = normalized_raw.parse::<u64>() {
1395                Value::Number(n.into())
1396            } else if let Ok(n) = normalized_raw.parse::<f64>() {
1397                serde_json::Number::from_f64(n)
1398                    .map(Value::Number)
1399                    .unwrap_or(Value::String(normalized_raw.clone()))
1400            } else {
1401                Value::String(normalized_raw)
1402            };
1403
1404            arguments.insert(key, val);
1405        }
1406
1407        results.push(ToolCallResponse {
1408            id: format!("call_{}", rand::random::<u32>()),
1409            call_type: "function".to_string(),
1410            function: ToolCallFn {
1411                name,
1412                arguments: Value::Object(arguments),
1413            },
1414            index: None,
1415        });
1416    }
1417
1418    // -- Format 2: XML (Qwen/Claude style) --
1419    let re_xml_call = Regex::new(
1420        r#"(?s)<tool_call>\s*<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>)?\s*</tool_call>"#
1421    ).unwrap();
1422    let re_xml_param =
1423        Regex::new(r#"(?s)<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)</parameter>"#).unwrap();
1424
1425    for cap in re_xml_call.captures_iter(text) {
1426        let name = cap[1].to_string();
1427        let body = &cap[2];
1428        let mut arguments = serde_json::Map::new();
1429
1430        for p_cap in re_xml_param.captures_iter(body) {
1431            let key = p_cap[1].to_string();
1432            let val_raw = p_cap[2].trim();
1433            let val = if val_raw == "true" {
1434                Value::Bool(true)
1435            } else if val_raw == "false" {
1436                Value::Bool(false)
1437            } else if let Ok(n) = val_raw.parse::<i64>() {
1438                Value::Number(n.into())
1439            } else if let Ok(n) = val_raw.parse::<u64>() {
1440                Value::Number(n.into())
1441            } else {
1442                Value::String(val_raw.to_string())
1443            };
1444            arguments.insert(key, val);
1445        }
1446
1447        results.push(ToolCallResponse {
1448            id: format!("call_{}", rand::random::<u32>()),
1449            call_type: "function".to_string(),
1450            function: ToolCallFn {
1451                name,
1452                arguments: Value::Object(arguments),
1453            },
1454            index: None,
1455        });
1456    }
1457
1458    // -- Format 3: shorthand XML wrapper (<tool_call>name(key="value")</tool_call>) --
1459    let re_short_call =
1460        Regex::new(r#"(?s)<tool_call>\s*([A-Za-z_][A-Za-z0-9_]*)\((.*?)\)\s*</tool_call>"#)
1461            .unwrap();
1462    let re_short_arg = Regex::new(
1463        r#"([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(?:"((?:\\.|[^"])*)"|'((?:\\.|[^'])*)'|([^,\)]+))"#,
1464    )
1465    .unwrap();
1466
1467    for cap in re_short_call.captures_iter(text) {
1468        let name = cap[1].to_string();
1469        let args_str = cap[2].trim();
1470        let mut arguments = serde_json::Map::new();
1471
1472        for arg_cap in re_short_arg.captures_iter(args_str) {
1473            let key = arg_cap[1].to_string();
1474            let val_raw = arg_cap
1475                .get(2)
1476                .or_else(|| arg_cap.get(3))
1477                .or_else(|| arg_cap.get(4))
1478                .map(|m| m.as_str())
1479                .unwrap_or("")
1480                .trim();
1481            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
1482
1483            let val = if normalized_raw == "true" {
1484                Value::Bool(true)
1485            } else if normalized_raw == "false" {
1486                Value::Bool(false)
1487            } else if let Ok(n) = normalized_raw.parse::<i64>() {
1488                Value::Number(n.into())
1489            } else if let Ok(n) = normalized_raw.parse::<u64>() {
1490                Value::Number(n.into())
1491            } else if let Ok(n) = normalized_raw.parse::<f64>() {
1492                serde_json::Number::from_f64(n)
1493                    .map(Value::Number)
1494                    .unwrap_or(Value::String(normalized_raw.clone()))
1495            } else {
1496                Value::String(normalized_raw)
1497            };
1498
1499            arguments.insert(key, val);
1500        }
1501
1502        results.push(ToolCallResponse {
1503            id: format!("call_{}", rand::random::<u32>()),
1504            call_type: "function".to_string(),
1505            function: ToolCallFn {
1506                name,
1507                arguments: Value::Object(arguments),
1508            },
1509            index: None,
1510        });
1511    }
1512
1513    results
1514}
1515
1516pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
1517    let trimmed = raw.trim();
1518    let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
1519
1520    let mut value = match serde_json::from_str::<Value>(&candidate) {
1521        Ok(v) => v,
1522        Err(_) => return candidate,
1523    };
1524    normalize_tool_argument_value(tool_name, &mut value);
1525    value.to_string()
1526}
1527
1528pub fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
1529    match value {
1530        Value::String(s) => *s = normalize_string_arg(s),
1531        Value::Array(items) => {
1532            for item in items {
1533                normalize_tool_argument_value(tool_name, item);
1534            }
1535        }
1536        Value::Object(map) => {
1537            for val in map.values_mut() {
1538                normalize_tool_argument_value(tool_name, val);
1539            }
1540            if tool_name == "grep_files" {
1541                if let Some(Value::String(pattern)) = map.get_mut("pattern") {
1542                    *pattern = normalize_regex_pattern(pattern);
1543                }
1544            }
1545            for key in ["path", "extension", "query", "command", "reason"] {
1546                if let Some(Value::String(s)) = map.get_mut(key) {
1547                    *s = normalize_string_arg(s);
1548                }
1549            }
1550        }
1551        _ => {}
1552    }
1553}
1554
/// Removes exactly one pair of matching wrapping quotes (`"`, `'` or `` ` ``).
///
/// Returns `None` when `input` is not wrapped in a matching pair. Otherwise the
/// inner text has `\"` replaced by `"`, then `\\` replaced by `\`, and is trimmed.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    let inner = ['"', '\'', '`']
        .iter()
        .find_map(|&quote| input.strip_prefix(quote)?.strip_suffix(quote))?;

    let unescaped_quotes = inner.replace("\\\"", "\"");
    let unescaped = unescaped_quotes.replace("\\\\", "\\");
    Some(unescaped.trim().to_string())
}
1568
/// Trims `input` and peels off every layer of matching wrapper quotes.
///
/// A layer is a pair of identical delimiters (`"`, `'` or backtick) at the
/// very start and end of the text. After each layer is removed the remainder
/// is trimmed again, and peeling repeats until no matching pair is left.
/// The result is idempotent: normalizing it a second time changes nothing.
fn normalize_string_arg(input: &str) -> String {
    const WRAPPERS: [char; 3] = ['"', '\'', '`'];

    // Work on borrowed slices and allocate once at the end instead of
    // building a fresh `String` for every peeled layer.
    let mut rest = input.trim();
    while let Some(inner) = WRAPPERS
        .iter()
        .find_map(move |&quote| rest.strip_prefix(quote)?.strip_suffix(quote))
    {
        rest = inner.trim();
    }
    rest.to_owned()
}
1586
1587fn normalize_regex_pattern(input: &str) -> String {
1588    let out = normalize_string_arg(input);
1589    if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
1590        out[1..out.len() - 1].to_string()
1591    } else {
1592        out
1593    }
1594}
1595
1596fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
1597    let mut system_blocks = Vec::new();
1598    let mut prepared = Vec::new();
1599    let mut seeded = false;
1600
1601    for message in messages {
1602        if message.role == "system" {
1603            let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
1604                .trim()
1605                .to_string();
1606            if !cleaned.is_empty() {
1607                system_blocks.push(cleaned);
1608            }
1609            continue;
1610        }
1611
1612        let mut clone = message.clone();
1613        clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
1614
1615        if !seeded && message.role == "user" {
1616            let mut merged = String::new();
1617            if !system_blocks.is_empty() {
1618                merged.push_str("System instructions for this turn:\n");
1619                merged.push_str(&system_blocks.join("\n\n"));
1620                merged.push_str("\n\n");
1621            }
1622            merged.push_str(clone.content.as_str());
1623            clone.content = MessageContent::Text(merged);
1624            seeded = true;
1625        }
1626
1627        prepared.push(clone);
1628    }
1629
1630    if !seeded && !system_blocks.is_empty() {
1631        prepared.insert(
1632            0,
1633            ChatMessage::user(&format!(
1634                "System instructions for this turn:\n{}",
1635                system_blocks.join("\n\n")
1636            )),
1637        );
1638    }
1639
1640    prepared
1641}
1642
/// Deletes legacy turn-wrapper markers from `text` and trims the result.
///
/// The markers are removed in the listed order: the four openers
/// `<|turn>{system,user,assistant,tool}` (each including its trailing newline)
/// followed by the closer `<turn|>`. An opener without the trailing newline is
/// left in place.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const TURN_MARKERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let stripped = TURN_MARKERS.iter().fold(text.to_owned(), |acc, marker| {
        // Skip the reallocation for markers that are not present.
        if acc.contains(*marker) {
            acc.replace(*marker, "")
        } else {
            acc
        }
    });
    stripped.trim().to_owned()
}
1652
1653pub fn strip_native_tool_call_text(text: &str) -> String {
1654    use regex::Regex;
1655    // Format 1: Gemma 4 Native
1656    let re_call = Regex::new(
1657        r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
1658    ).unwrap();
1659    // Format 2: XML (Qwen/Claude style)
1660    let re_xml = Regex::new(r#"(?s)<tool_call>\s*<function=.*?>.*?</tool_call>"#).unwrap();
1661    // Format 3: shorthand XML wrapper
1662    let re_short =
1663        Regex::new(r#"(?s)<tool_call>\s*[A-Za-z_][A-Za-z0-9_]*\(.*?\)\s*</tool_call>"#).unwrap();
1664    let re_response =
1665        Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
1666            .unwrap();
1667    let without_calls = re_call.replace_all(text, "");
1668    let without_xml = re_xml.replace_all(without_calls.as_ref(), "");
1669    let without_short = re_short.replace_all(without_xml.as_ref(), "");
1670    re_response
1671        .replace_all(without_short.as_ref(), "")
1672        .trim()
1673        .to_string()
1674}
1675
/// Picks the context length to report for the currently effective model.
///
/// Precedence:
/// 1. no model (`"no model loaded"` or a blank name) → `0`;
/// 2. a freshly detected, non-zero context → `detected_context`;
/// 3. the model is unchanged from last time → `previous_context`;
/// 4. otherwise → `0`, so a stale value is never carried over to another model.
fn resolve_runtime_context(
    previous_model: &str,
    previous_context: usize,
    effective_model: &str,
    detected_context: usize,
) -> usize {
    let model_missing =
        effective_model == "no model loaded" || effective_model.trim().is_empty();

    match (model_missing, detected_context) {
        (true, _) => 0,
        (false, detected) if detected > 0 => detected,
        (false, _) if effective_model == previous_model => previous_context,
        (false, _) => 0,
    }
}
1692
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::{Path, PathBuf};
    use std::sync::{Mutex, MutexGuard};

    /// Serializes the tests in this module that change the process-wide
    /// working directory. The test harness runs tests on several threads, and
    /// the CWD is shared by all of them.
    static CWD_LOCK: Mutex<()> = Mutex::new(());

    /// Switches the process working directory for as long as the guard lives
    /// and restores the previous one on drop, including during a panic unwind.
    struct CwdGuard {
        previous: PathBuf,
        // Held until after `drop` has restored the directory.
        _lock: MutexGuard<'static, ()>,
    }

    impl CwdGuard {
        fn enter(dir: &Path) -> Self {
            // A poisoned lock only means another CWD test panicked; the guard
            // it held has already restored the directory, so keep going.
            let lock = CWD_LOCK
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            let previous = std::env::current_dir().unwrap();
            std::env::set_current_dir(dir).unwrap();
            Self {
                previous,
                _lock: lock,
            }
        }
    }

    impl Drop for CwdGuard {
        fn drop(&mut self) {
            // Best effort: panicking inside `drop` during an unwind would abort.
            let _ = std::env::set_current_dir(&self.previous);
        }
    }

    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let system = engine.build_system_prompt(0, 50, false, true, &[], None, None, &[]);
        assert!(system.contains(crate::HEMATITE_VERSION));
    }

    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        // Opening tag is `<|tool_call>`, closing tag is `<tool_call|>`.
        let text = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "read_file");

        let args: Value = calls[0].function.arguments.clone();
        assert_eq!(args.get("limit").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(args.get("offset").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(
            args.get("path").and_then(|v| v.as_str()),
            Some("src/ui/tui.rs")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<tool_call|>"));
    }

    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let text = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "list_files");
        assert_eq!(calls[1].function.name, "read_file");

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<|tool_response"));
        assert!(!stripped.contains("<tool_response|>"));
    }

    #[test]
    fn create_directory_is_treated_as_mutating_repo_write() {
        let metadata = tool_metadata_for_name("create_directory");
        assert!(metadata.mutates_workspace);
        assert!(!metadata.read_only_friendly);
    }

    #[test]
    fn extracts_qwen_xml_tool_calls_from_reasoning() {
        let text = r#"Based on the project structure, I need to check the binary.
<tool_call>
<function=shell>
<parameter=command>
ls -la hematite.exe
</parameter>
<parameter=reason>
Check if the binary exists
</parameter>
</function>
</tool_call>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "shell");

        let args: Value = calls[0].function.arguments.clone();
        assert_eq!(
            args.get("command").and_then(|v| v.as_str()),
            Some("ls -la hematite.exe")
        );
        assert_eq!(
            args.get("reason").and_then(|v| v.as_str()),
            Some("Check if the binary exists")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("<function=shell>"));
    }

    #[test]
    fn extracts_shorthand_tool_calls_from_reasoning() {
        let text = r#"<thinking>
The user wants a search first.
</thinking>

I'll search before continuing.

<tool_call>research_web(query="uefn toolbelt python automation unreal engine fortnite")</tool_call>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "research_web");

        let args: Value = calls[0].function.arguments.clone();
        assert_eq!(
            args.get("query").and_then(|v| v.as_str()),
            Some("uefn toolbelt python automation unreal engine fortnite")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("research_web(query="));
    }

    #[test]
    fn strips_thinking_tag_as_reasoning_prefix() {
        // NOTE(review): the expected value is empty even though the input has
        // text after `</thinking>` — confirm this is the intended contract of
        // `strip_think_blocks`.
        let cleaned =
            strip_think_blocks("<thinking>\nThe user wants a search.\n</thinking>\nVisible answer");
        assert_eq!(cleaned, "");
    }

    #[test]
    fn resolve_runtime_context_returns_zero_when_no_model_loaded() {
        assert_eq!(
            resolve_runtime_context("qwen/qwen3.5-9b", 32000, "no model loaded", 0),
            0
        );
    }

    #[test]
    fn resolve_runtime_context_preserves_previous_only_for_same_model() {
        assert_eq!(
            resolve_runtime_context("qwen/qwen3.5-9b", 32000, "qwen/qwen3.5-9b", 0),
            32000
        );
        assert_eq!(
            resolve_runtime_context("qwen/qwen3.5-9b", 32000, "bonsai-8b", 0),
            0
        );
    }

    #[test]
    fn load_instruction_files_includes_workspace_guidance_files() {
        let temp = tempfile::tempdir().unwrap();

        fs::write(
            temp.path().join("SKILLS.md"),
            "# Workspace Skills\n- Prefer API-first changes before UI polish.",
        )
        .unwrap();

        // The guard restores the CWD (and releases the lock) before asserting.
        let loaded = {
            let _cwd = CwdGuard::enter(temp.path());
            load_instruction_files()
        };

        assert!(loaded.contains("SKILLS.md"));
        assert!(loaded.contains("Prefer API-first changes before UI polish."));
    }

    #[test]
    fn load_agent_skill_catalog_includes_skill_directory_entries() {
        let temp = tempfile::tempdir().unwrap();

        fs::create_dir_all(temp.path().join(".agents/skills/code-review")).unwrap();
        fs::write(
            temp.path().join(".agents/skills/code-review/SKILL.md"),
            "---\nname: code-review\ndescription: Review diffs and flag regressions.\ncompatibility: Requires git\n---\n",
        )
        .unwrap();

        // The guard restores the CWD (and releases the lock) before asserting.
        let loaded = {
            let _cwd = CwdGuard::enter(temp.path());
            load_agent_skill_catalog()
        };

        assert!(loaded.contains("Agent Skills Catalog"));
        assert!(loaded.contains("code-review"));
        assert!(loaded.contains("Review diffs and flag regressions."));
    }
}
// Source path: hematite/agent/inference.rs