hematite/agent/
inference.rs

1use serde::Serialize;
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6pub use crate::agent::types::*;
7
8// ── Engine ────────────────────────────────────────────────────────────────────
9
/// Shared handle for driving a local model provider and tracking the runtime
/// profile (model name and context size) used when building prompts.
pub struct InferenceEngine {
    /// Active provider backend behind an async read/write lock. Most engine
    /// methods take a read lock and delegate; `set_runtime_profile` takes the
    /// write lock.
    pub provider:
        std::sync::Arc<tokio::sync::RwLock<Box<dyn crate::agent::provider::ModelProvider>>>,
    /// Model identifier last stored by `set_runtime_profile`; starts empty.
    pub cached_model: std::sync::Arc<std::sync::RwLock<String>>,
    /// Context length (tokens) last stored by `set_runtime_profile`; starts at 0.
    pub cached_context: std::sync::Arc<std::sync::atomic::AtomicUsize>,
    /// `scheme://authority` portion of the API URL passed to `new`.
    pub base_url: String,
    /// Persona label interpolated into the non-professional system prompt.
    pub species: String,
    /// Snark level stored at construction time.
    pub snark: u8,
    /// Created with 3 permits in `new`.
    /// NOTE(review): what it gates is not visible in this part of the file.
    pub kv_semaphore: Semaphore,
    /// Session-wide economics record shared behind a mutex.
    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
    /// Optional model ID for worker-level tasks (Swarms / research).
    pub worker_model: Option<String>,
    /// Opt-in Gemma-native request shaping. Off by default.
    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
    /// Global cancellation token for hard-interrupting the inference stream.
    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
}
27
/// Returns `true` when the model identifier contains `gemma-4` or `gemma4`,
/// compared case-insensitively (ASCII).
pub fn is_hematite_native_model(model: &str) -> bool {
    const NATIVE_MARKERS: [&str; 2] = ["gemma-4", "gemma4"];

    let normalized = model.to_ascii_lowercase();
    NATIVE_MARKERS
        .iter()
        .any(|marker| normalized.contains(*marker))
}
32
33fn should_use_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
34    is_hematite_native_model(model) && engine.gemma_native_formatting_enabled()
35}
36
37// ── OpenAI Tool Definition ────────────────────────────────────────────────────
38
39pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
40    if name.starts_with("mcp__") {
41        let lower = name.to_ascii_lowercase();
42        let mutates_workspace = [
43            "__edit",
44            "__write",
45            "__create",
46            "__move",
47            "__delete",
48            "__remove",
49            "__rename",
50            "__replace",
51            "__patch",
52        ]
53        .iter()
54        .any(|needle| lower.contains(needle));
55        return ToolMetadata {
56            category: ToolCategory::External,
57            mutates_workspace,
58            external_surface: true,
59            trust_sensitive: true,
60            read_only_friendly: !mutates_workspace,
61            plan_scope: false,
62        };
63    }
64
65    match name {
66        "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
67            category: ToolCategory::RepoRead,
68            mutates_workspace: false,
69            external_surface: false,
70            trust_sensitive: false,
71            read_only_friendly: true,
72            plan_scope: true,
73        },
74        "create_directory" | "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => {
75            ToolMetadata {
76                category: ToolCategory::RepoWrite,
77                mutates_workspace: true,
78                external_surface: false,
79                trust_sensitive: true,
80                read_only_friendly: false,
81                plan_scope: true,
82            }
83        }
84        "trace_runtime_flow" => ToolMetadata {
85            category: ToolCategory::Architecture,
86            mutates_workspace: false,
87            external_surface: false,
88            trust_sensitive: false,
89            read_only_friendly: true,
90            plan_scope: false,
91        },
92        "describe_toolchain" => ToolMetadata {
93            category: ToolCategory::Toolchain,
94            mutates_workspace: false,
95            external_surface: false,
96            trust_sensitive: false,
97            read_only_friendly: true,
98            plan_scope: false,
99        },
100        "shell" => ToolMetadata {
101            category: ToolCategory::Runtime,
102            mutates_workspace: true,
103            external_surface: false,
104            trust_sensitive: true,
105            read_only_friendly: false,
106            plan_scope: false,
107        },
108        "inspect_host" => ToolMetadata {
109            category: ToolCategory::Runtime,
110            mutates_workspace: false,
111            external_surface: false,
112            trust_sensitive: false,
113            read_only_friendly: true,
114            plan_scope: false,
115        },
116        "resolve_host_issue" => ToolMetadata {
117            category: ToolCategory::Runtime,
118            mutates_workspace: true,
119            external_surface: true,
120            trust_sensitive: true,
121            read_only_friendly: false,
122            plan_scope: false,
123        },
124        "run_hematite_maintainer_workflow" => ToolMetadata {
125            category: ToolCategory::Workflow,
126            mutates_workspace: true,
127            external_surface: false,
128            trust_sensitive: true,
129            read_only_friendly: false,
130            plan_scope: false,
131        },
132        "run_workspace_workflow" => ToolMetadata {
133            category: ToolCategory::Workflow,
134            mutates_workspace: true,
135            external_surface: false,
136            trust_sensitive: true,
137            read_only_friendly: false,
138            plan_scope: false,
139        },
140        "verify_build" => ToolMetadata {
141            category: ToolCategory::Verification,
142            mutates_workspace: false,
143            external_surface: false,
144            trust_sensitive: false,
145            read_only_friendly: true,
146            plan_scope: true,
147        },
148        "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
149            ToolMetadata {
150                category: ToolCategory::Git,
151                mutates_workspace: true,
152                external_surface: false,
153                trust_sensitive: true,
154                read_only_friendly: false,
155                plan_scope: false,
156            }
157        }
158        "research_web" | "fetch_docs" => ToolMetadata {
159            category: ToolCategory::Research,
160            mutates_workspace: false,
161            external_surface: false,
162            trust_sensitive: false,
163            read_only_friendly: true,
164            plan_scope: false,
165        },
166        "vision_analyze" => ToolMetadata {
167            category: ToolCategory::Vision,
168            mutates_workspace: false,
169            external_surface: false,
170            trust_sensitive: false,
171            read_only_friendly: true,
172            plan_scope: false,
173        },
174        "lsp_definitions"
175        | "lsp_references"
176        | "lsp_hover"
177        | "lsp_rename_symbol"
178        | "lsp_get_diagnostics"
179        | "lsp_search_symbol" => ToolMetadata {
180            category: ToolCategory::Lsp,
181            mutates_workspace: false,
182            external_surface: false,
183            trust_sensitive: false,
184            read_only_friendly: true,
185            plan_scope: false,
186        },
187        "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
188            category: ToolCategory::Workflow,
189            mutates_workspace: false,
190            external_surface: false,
191            trust_sensitive: false,
192            read_only_friendly: true,
193            plan_scope: true,
194        },
195        "manage_tasks" => ToolMetadata {
196            category: ToolCategory::Workflow,
197            mutates_workspace: false,
198            external_surface: false,
199            trust_sensitive: false,
200            read_only_friendly: true,
201            plan_scope: false,
202        },
203        _ => ToolMetadata {
204            category: ToolCategory::Other,
205            mutates_workspace: false,
206            external_surface: false,
207            trust_sensitive: false,
208            read_only_friendly: true,
209            plan_scope: false,
210        },
211    }
212}
213// ── Message types migrated to types.rs ────────────────────────────────────────
214
215// ── HTTP request / response shapes ───────────────────────────────────────────
216
// Lower and upper bounds, in tokens, for the output reservation.
// NOTE(review): the code that applies these bounds is outside this part of
// the file — confirm how they are used before changing the values.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1024;
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4096;
219
/// A context window of at most 8,192 tokens is treated as "tiny".
fn is_tiny_context_window(context_length: usize) -> bool {
    matches!(context_length, 0..=8_192)
}
223
/// A context window above 8,192 and up to 49,152 tokens (inclusive) is
/// treated as "compact".
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
227
/// Public wrapper around the private `is_compact_context_window` check so
/// other modules can ask whether a context length falls in the compact tier.
pub fn is_compact_context_window_pub(context_length: usize) -> bool {
    is_compact_context_window(context_length)
}
231
/// Detects provider error text that indicates the prompt overflowed the
/// context window.
///
/// `lower` is matched case-sensitively against lowercase markers, so callers
/// are expected to pass an already-lowercased string.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    // Any one of these phrases is sufficient on its own.
    const STANDALONE_MARKERS: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    // `n_keep` only counts when `n_ctx` is mentioned as well.
    let mentions_keep_and_ctx = lower.contains("n_keep") && lower.contains("n_ctx");

    mentions_keep_and_ctx
        || STANDALONE_MARKERS
            .iter()
            .any(|marker| lower.contains(*marker))
}
239
240fn classify_runtime_failure_tag(detail: &str) -> &'static str {
241    let lower = detail.to_ascii_lowercase();
242    if lower.contains("context_window_blocked")
243        || lower.contains("context ceiling reached")
244        || lower.contains("exceeds the")
245        || is_provider_context_limit_detail(&lower)
246    {
247        "context_window"
248    } else if lower.contains("empty response from model")
249        || lower.contains("model returned an empty response")
250    {
251        "empty_model_response"
252    } else if lower.contains("action blocked:")
253        || lower.contains("access denied")
254        || lower.contains("declined by user")
255    {
256        "tool_policy_blocked"
257    } else {
258        "provider_degraded"
259    }
260}
261
/// Returns the user-facing recovery hint for a failure tag. Unrecognized tags
/// get the generic "retry, then narrow or restart" hint.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    if tag == "context_window" {
        return "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    }
    if tag == "empty_model_response" {
        return "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    }
    if tag == "tool_policy_blocked" {
        return "Stay inside the allowed workflow or switch modes before retrying.";
    }
    "Retry once automatically, then narrow the turn or restart LM Studio if it persists."
}
276
277fn format_runtime_failure_message(detail: &str) -> String {
278    let tag = classify_runtime_failure_tag(detail);
279    format!(
280        "[failure:{}] {} Detail: {}",
281        tag,
282        runtime_failure_guidance(tag),
283        detail.trim()
284    )
285}
286
287// ── Events pushed to the TUI (migrated to types.rs) ──────────────────────────
288
289// ── Engine implementation ─────────────────────────────────────────────────────
290
291impl InferenceEngine {
292    pub fn new(
293        api_url: String,
294        species: String,
295        snark: u8,
296    ) -> Result<Self, Box<dyn std::error::Error>> {
297        let client = reqwest::Client::builder()
298            .timeout(std::time::Duration::from_secs(180))
299            .build()?;
300
301        let base_url = {
302            let trimmed = api_url.trim_end_matches('/');
303            if let Some(scheme_end) = trimmed.find("://") {
304                let after_scheme = &trimmed[scheme_end + 3..];
305                if let Some(path_start) = after_scheme.find('/') {
306                    format!(
307                        "{}://{}",
308                        &trimmed[..scheme_end],
309                        &after_scheme[..path_start]
310                    )
311                } else {
312                    trimmed.to_string()
313                }
314            } else {
315                trimmed.to_string()
316            }
317        };
318
319        let api_url_full = if api_url.ends_with("/chat/completions") {
320            api_url
321        } else if api_url.ends_with("/") {
322            format!("{}chat/completions", api_url)
323        } else {
324            format!("{}/chat/completions", api_url)
325        };
326
327        let lms = crate::agent::lms::LmsHarness::new();
328        let ollama_harness = crate::agent::ollama::OllamaHarness::new(&base_url);
329
330        let provider = if base_url.contains("11434") {
331            Box::new(crate::agent::provider::OllamaProvider {
332                client: client.clone(),
333                base_url: base_url.clone(),
334                model: String::new(),
335                context_length: 8192,
336                embed_model: std::sync::Arc::new(std::sync::RwLock::new(None)),
337                ollama: ollama_harness,
338            }) as Box<dyn crate::agent::provider::ModelProvider>
339        } else {
340            Box::new(crate::agent::provider::LmsProvider {
341                client: client.clone(),
342                api_url: api_url_full,
343                base_url: base_url.clone(),
344                model: String::new(),
345                context_length: 0,
346                lms,
347            }) as Box<dyn crate::agent::provider::ModelProvider>
348        };
349
350        Ok(Self {
351            provider: std::sync::Arc::new(tokio::sync::RwLock::new(provider)),
352            cached_model: std::sync::Arc::new(std::sync::RwLock::new(String::new())),
353            cached_context: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)),
354            base_url: base_url.clone(),
355            species: species.clone(),
356            snark,
357            kv_semaphore: Semaphore::new(3),
358            economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
359            worker_model: None,
360            gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
361            cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
362        })
363    }
364
365    pub fn set_gemma_native_formatting(&self, enabled: bool) {
366        self.gemma_native_formatting
367            .store(enabled, std::sync::atomic::Ordering::SeqCst);
368    }
369
370    pub async fn health_check(&self) -> bool {
371        let p = self.provider.read().await;
372        p.health_check().await
373    }
374
375    pub async fn provider_name(&self) -> String {
376        let p = self.provider.read().await;
377        p.name().to_string()
378    }
379
380    pub async fn get_loaded_model(&self) -> Option<String> {
381        let p = self.provider.read().await;
382        match p.detect_model().await {
383            Ok(m) if m.is_empty() => Some("".to_string()),
384            Ok(m) => Some(m),
385            Err(_) => None,
386        }
387    }
388
389    pub async fn get_embedding_model(&self) -> Option<String> {
390        let p = self.provider.read().await;
391        p.get_embedding_model().await
392    }
393
394    pub async fn load_model(&self, model_id: &str) -> Result<(), String> {
395        let p = self.provider.read().await;
396        p.load_model(model_id).await
397    }
398
399    pub async fn load_model_with_context(
400        &self,
401        model_id: &str,
402        context_length: Option<usize>,
403    ) -> Result<(), String> {
404        let p = self.provider.read().await;
405        p.load_model_with_context(model_id, context_length).await
406    }
407
408    pub async fn load_embedding_model(&self, model_id: &str) -> Result<(), String> {
409        let p = self.provider.read().await;
410        p.load_embedding_model(model_id).await
411    }
412
413    pub async fn list_provider_models(
414        &self,
415        kind: crate::agent::provider::ProviderModelKind,
416        loaded_only: bool,
417    ) -> Result<Vec<String>, String> {
418        let p = self.provider.read().await;
419        p.list_models(kind, loaded_only).await
420    }
421
422    pub async fn unload_model(&self, model_id: Option<&str>, all: bool) -> Result<String, String> {
423        let p = self.provider.read().await;
424        p.unload_model(model_id, all).await
425    }
426
427    pub async fn unload_embedding_model(&self, model_id: Option<&str>) -> Result<String, String> {
428        let p = self.provider.read().await;
429        p.unload_embedding_model(model_id).await
430    }
431
432    pub async fn prewarm(&self) -> Result<(), String> {
433        let p = self.provider.read().await;
434        p.prewarm().await
435    }
436
437    pub async fn detect_context_length(&self) -> usize {
438        let p = self.provider.read().await;
439        p.detect_context_length().await
440    }
441
442    pub async fn set_runtime_profile(&self, model: &str, context_length: usize) {
443        if let Ok(mut guard) = self.cached_model.write() {
444            *guard = model.to_string();
445        }
446        self.cached_context
447            .store(context_length, std::sync::atomic::Ordering::SeqCst);
448
449        let mut p = self.provider.write().await;
450        p.set_runtime_profile(model, context_length);
451    }
452
453    pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
454        let previous_model = self.current_model();
455        let previous_context = self.current_context_length();
456
457        let detected_model = match self.get_loaded_model().await {
458            Some(m) if !m.is_empty() => m,
459            Some(_) => "no model loaded".to_string(),
460            None => previous_model.clone(),
461        };
462
463        let detected_context = self.detect_context_length().await;
464        let effective_model = if detected_model.is_empty() {
465            previous_model.clone()
466        } else {
467            detected_model
468        };
469        let effective_context = resolve_runtime_context(
470            &previous_model,
471            previous_context,
472            &effective_model,
473            detected_context,
474        );
475
476        let changed = effective_model != previous_model || effective_context != previous_context;
477        if changed {
478            self.set_runtime_profile(&effective_model, effective_context)
479                .await;
480        }
481
482        Some((effective_model, effective_context, changed))
483    }
484
485    pub fn build_system_prompt(
486        &self,
487        snark: u8,
488        chaos: u8,
489        brief: bool,
490        professional: bool,
491        tools: &[ToolDefinition],
492        reasoning_history: Option<&str>,
493        environment_summary: Option<&str>,
494        mcp_tools: &[crate::agent::mcp::McpTool],
495    ) -> String {
496        let mut sys = self.build_system_prompt_legacy(
497            snark,
498            chaos,
499            brief,
500            professional,
501            tools,
502            reasoning_history,
503            environment_summary,
504        );
505
506        if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
507            sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
508            sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
509            for tool in mcp_tools {
510                let description = tool
511                    .description
512                    .as_deref()
513                    .unwrap_or("No description provided.");
514                sys.push_str(&format!("- {}: {}\n", tool.name, description));
515            }
516        }
517
518        sys
519    }
520
    /// Builds the system prompt without the MCP tool section.
    ///
    /// Dispatches on the cached context length: tiny windows get
    /// `build_system_prompt_tiny`, compact windows get
    /// `build_system_prompt_compact`, and everything else gets the full prompt
    /// assembled below (operating protocol, optional reasoning history, thought
    /// efficiency, environment, identity, model budget, tool/shell rules,
    /// protocols, instruction files, skills, memories, and tool declarations).
    ///
    /// * `snark` - interpolated into the non-professional identity line.
    /// * `_chaos` - accepted for signature compatibility; unused here.
    /// * `brief` - selects the low-effort thinking block and adds a brief-mode advisory.
    /// * `professional` - selects the professional identity line.
    /// * `tools` - each tool is emitted as a native declaration at the end.
    /// * `reasoning_history` - appended as internal state when present and non-empty.
    /// * `environment_summary` - appended under a host-environment heading when present.
    pub fn build_system_prompt_legacy(
        &self,
        snark: u8,
        _chaos: u8,
        brief: bool,
        professional: bool,
        tools: &[ToolDefinition],
        reasoning_history: Option<&str>,
        environment_summary: Option<&str>,
    ) -> String {
        // Smaller context windows get a dedicated, shorter prompt.
        let current_context_length = self.current_context_length();
        if is_tiny_context_window(current_context_length) {
            return self.build_system_prompt_tiny(brief, professional);
        }
        if is_compact_context_window(current_context_length) {
            return self.build_system_prompt_compact(brief, professional, tools);
        }

        // Hematite bootstrap: keep reasoning disciplined without leaking scaffolding into user-facing replies.
        let mut sys = String::from("## HEMATITE OPERATING PROTOCOL\n\
                                     - You are Hematite, a local coding system working on the user's machine.\n\
                                     - The running Hematite build is ");
        sys.push_str(&crate::hematite_version_display());
        sys.push_str(".\n\
                                     - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
                                     - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
                                     - For simple questions, answer briefly in plain language.\n\
                                     - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
                                     - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
                                     - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
                                     - Keep internal reasoning inside channel delimiters.\n\
                                     - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\n");

        // Empty history is skipped so no bare heading is emitted.
        if let Some(history) = reasoning_history {
            if !history.is_empty() {
                sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
                sys.push_str(history);
                sys.push_str("\n\n");
            }
        }

        // ADAPTIVE THOUGHT EFFICIENCY (Gemma-4 Native)
        if brief {
            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
                          - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
                          - Depth: Surface-level verification only.\n\n");
        } else {
            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
                          - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
                          - Depth: Full multi-step derivation required.\n\n");
        }

        // IDENTITY & ENVIRONMENT
        let os = std::env::consts::OS;
        if let Some(summary) = environment_summary {
            sys.push_str("## HOST ENVIRONMENT\n");
            sys.push_str(summary);
            sys.push_str("\n\n");
        }

        if professional {
            sys.push_str(&format!(
                "You are Hematite, a local coding system running on {}. \
                 The TUI is one interface layer, not your whole identity. \
                 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
                 Skip filler and keep the focus on the work.\n",
                os
            ));
        } else {
            sys.push_str(&format!(
                "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
                 The terminal UI is only one surface of the system. \
                 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
                 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
                self.species, snark, os
            ));
        }

        // Inject loaded model and context window so the model knows its own budget.
        let current_model = self.current_model();
        if !current_model.is_empty() {
            sys.push_str(&format!(
                "Loaded model: {} | Context window: {} tokens. \
                 Calibrate response length and tool-call depth to fit within this budget.\n\n",
                current_model, current_context_length
            ));
            if is_hematite_native_model(&current_model) {
                sys.push_str(
                    "Sovereign native note: prefer exact tool JSON with no extra prose when calling tools. \
                     Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
                     For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
                );
            }
        } else {
            sys.push_str(&format!(
                "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
                current_context_length
            ));
        }

        // PROTOCOL & TOOLS
        // The shell guidance is chosen at compile time from the target OS.
        let shell_desc = if cfg!(target_os = "windows") {
            "[EXTERNAL SHELL]: `powershell` (Windows).\n\
             - Use ONLY for builds, tests, or file migrations. \n\
             - You MUST use the `powershell` tool directly. \n\
             - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
        } else {
            "[EXTERNAL SHELL]: `bash` (Unix).\n\
             - Use ONLY for builds, tests, or file migrations. \n\
             - NEVER wrap bash in other shells. \n\n"
        };

        sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
                      [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
                      - These are the ONLY way to explore and modify code. \n\
                      - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
        sys.push_str(shell_desc);

        // ANTI-LOOPING & SELF-AUDIT
        sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
                      SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");

        sys.push_str("## THE COMPUTATIONAL RESEARCH MANDATE\n\
                      - You are a Lead Computational Researcher and Senior Scientist.\n\
                      - ZERO-TRUST MATH: You never guess results for math, physics, or algorithmic complexity.\n\
                      - UNIT-SAFETY: All physical calculations must use `scientific_compute(mode='units')` to ensure dimensional consistency.\n\
                      - SYMBOLIC PROOF: Use `scientific_compute(mode='symbolic')` for formal algebraic derivations and multi-variable proofs. Set `latex: true` for formal presentation.\n\
                      - EMPIRICAL AUDITING: All algorithmic performance claims must be verified with `scientific_compute(mode='complexity')` before being finalized.\n\
                      - SCIENTIFIC MEMORY (LEDGER): Use `scientific_compute(mode='ledger')` to persist long-form derivations, constants, and theorem steps to `.hematite/docs/scientific_ledger.md`. This ledger is RAG-indexed by The Vein, giving you persistent cross-session memory for project math.\n\
                      - DATASET COMPUTATION: Use `scientific_compute(mode='dataset')` to perform high-precision calculations on SQL results (CSV/DB/JSON). This bridges data science and formal research.\n\
                      - LIGHTWEIGHT SANDBOX: Prioritize pure Python implementations for all research tasks. Do NOT attempt to import heavy external libraries like 'numpy', 'scipy', or 'pandas' unless you have verified they are available or the user explicitly asks to work in a specific heavy environment or venv.\n\
                      - Every result must be backed by the executable logic used to prove it.\n\n");

        // Consolidated: All directives are now handled by the authoritative prompt.rs builder.
        sys.push_str("## TURN ADVISORY\n");
        if brief {
            sys.push_str("- BRIEF MODE: Respond with ONE concise sentence/block unless more code is required.\n");
        }
        sys.push_str("- INTERNAL REASONING: Plan your move in the thought channel first.\n");

        // Scaffolding protocol — enforces build validation after project creation.
        // NOTE(review): the numbered list under SCAFFOLDING PROTOCOL starts at
        // "2." — step 1 appears to be missing; confirm the intended text.
        sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
            2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
            3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
            4. Fix all errors before declaring success.\n\n\
            ## PRE-FLIGHT SCOPING PROTOCOL\n\
            Before attempting any multi-file task or complex refactor:\n\
            1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
            2. Use `auto_pin_context` to keep those files in active context.\n\
            3. Only then proceed to deeper edits or research.\n\n\
            ## REFACTORING PROTOCOL\n\
            When modifying existing code or renaming symbols:\n\
            1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
            2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
            3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");

        // Inject CLAUDE.md / instruction files from the project directory.
        sys.push_str(&load_instruction_files());
        sys.push_str(&load_agent_skill_catalog());

        // Inject cross-session memories synthesized by DeepReflect.
        sys.push_str(&crate::memory::deep_reflect::load_recent_memories());

        // Native Gemma-4 Tool Declarations
        // NOTE(review): each declaration emits an extra literal `{` before the
        // JSON schema with no matching `}`, and the description line ends with
        // a stray `)` — confirm against the expected declaration format.
        if !tools.is_empty() {
            sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
            for tool in tools {
                // A schema that fails to serialize degrades to an empty object.
                let schema = serde_json::to_string(&tool.function.parameters)
                    .unwrap_or_else(|_| "{}".to_string());
                sys.push_str(&format!(
                    "<|tool>declaration:{}{}{}<tool|>\n",
                    tool.function.name, "{", schema
                ));
                sys.push_str(&format!("// {})\n", tool.function.description));
            }
        }

        sys
    }
700
701    fn build_system_prompt_compact(
702        &self,
703        brief: bool,
704        professional: bool,
705        tools: &[ToolDefinition],
706    ) -> String {
707        // Compact tier: fits in 16k context. Keeps tool names + one-line descriptions
708        // but skips full JSON schemas, verbose protocol sections, and CLAUDE.md injection.
709        let current_model = self.current_model();
710        let current_context_length = self.current_context_length();
711        let os = std::env::consts::OS;
712
713        let mut sys = format!(
714            "You are Hematite {}, a local coding harness working on the user's machine.\n",
715            crate::hematite_version_display()
716        );
717        if professional {
718            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
719        } else {
720            sys.push_str(&format!(
721                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
722                self.species
723            ));
724        }
725        sys.push_str(&format!(
726            "Model: {} | Context: {} tokens. Keep turns focused.\n",
727            current_model, current_context_length
728        ));
729        if is_hematite_native_model(&current_model) {
730            sys.push_str(
731                "Sovereign native: use exact tool JSON. No extra prose in tool calls. \
732                 Raw regex patterns in grep_files, no slash delimiters.\n",
733            );
734        }
735        if cfg!(target_os = "windows") {
736            sys.push_str(&format!(
737                "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
738                os
739            ));
740        } else {
741            sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
742        }
743        if brief {
744            sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
745        }
746
747        sys.push_str(
748            "\nCORE RULES:\n\
749             - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
750             - Verify after edits: run `verify_build` after code changes, before committing.\n\
751             - One tool at a time. Do not batch unrelated tool calls.\n\
752             - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
753             - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
754             - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
755        );
756
757        if !tools.is_empty() {
758            sys.push_str("\n# AVAILABLE TOOLS\n");
759            for tool in tools {
760                let desc: String = tool.function.description.chars().take(120).collect();
761                sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
762            }
763        }
764
765        sys
766    }
767
768    fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
769        let current_model = self.current_model();
770        let current_context_length = self.current_context_length();
771        let os = std::env::consts::OS;
772        let mut sys = format!(
773            "You are Hematite {}, a local coding harness working on the user's machine.\n",
774            crate::hematite_version_display()
775        );
776        if professional {
777            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
778        } else {
779            sys.push_str(&format!(
780                "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
781                self.species
782            ));
783        }
784        if !current_model.is_empty() {
785            sys.push_str(&format!(
786                "Loaded model: {} | Context window: {} tokens.\n",
787                current_model, current_context_length
788            ));
789        } else {
790            sys.push_str(&format!(
791                "Context window: {} tokens.\n",
792                current_context_length
793            ));
794        }
795        sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
796        sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
797        sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
798        if cfg!(target_os = "windows") {
799            sys.push_str(&format!(
800                "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
801                os
802            ));
803        } else {
804            sys.push_str(&format!(
805                "You are running on {}. Use the native Unix shell conventions.\n",
806                os
807            ));
808        }
809        if brief {
810            sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
811        }
812        sys
813    }
814
815    pub fn current_model(&self) -> String {
816        self.cached_model
817            .read()
818            .map(|g| g.clone())
819            .unwrap_or_default()
820    }
821
822    pub fn current_context_length(&self) -> usize {
823        self.cached_context
824            .load(std::sync::atomic::Ordering::Relaxed)
825    }
826
827    pub fn is_compact_context_window(&self) -> bool {
828        let len = self.current_context_length();
829        len <= 16384
830    }
831
832    pub fn gemma_native_formatting_enabled(&self) -> bool {
833        self.gemma_native_formatting
834            .load(std::sync::atomic::Ordering::Relaxed)
835    }
836
837    pub async fn call_with_tools(
838        &self,
839        messages: &[ChatMessage],
840        tools: &[ToolDefinition],
841        // Override the model ID for this call. None = use the live runtime model.
842        model_override: Option<&str>,
843    ) -> Result<
844        (
845            Option<String>,
846            Option<Vec<ToolCallResponse>>,
847            Option<TokenUsage>,
848            Option<String>,
849        ),
850        String,
851    > {
852        let _permit = self
853            .kv_semaphore
854            .acquire()
855            .await
856            .map_err(|e| e.to_string())?;
857
858        let (res, model_name, prepared_messages) = {
859            let p = self.provider.read().await;
860            let model_name = model_override.unwrap_or(&p.current_model()).to_string();
861            let prepared_messages = if should_use_native_formatting(self, &model_name) {
862                prepare_gemma_native_messages(messages)
863            } else {
864                messages.to_vec()
865            };
866            if let Err(detail) = preflight_chat_request(
867                &model_name,
868                &prepared_messages,
869                tools,
870                self.current_context_length(),
871            ) {
872                return Err(format_runtime_failure_message(&detail));
873            }
874            let res = p
875                .call_with_tools(&prepared_messages, tools, model_override)
876                .await
877                .map_err(|e| format_runtime_failure_message(&e))?;
878            (res, model_name, prepared_messages)
879        };
880
881        if let Ok(mut econ) = self.economics.lock() {
882            econ.input_tokens += res.usage.prompt_tokens;
883            econ.output_tokens += res.usage.completion_tokens;
884        }
885
886        let mut content = res.content;
887        let mut tool_calls = res.tool_calls;
888
889        // Post-processing: Gemma 4 / thinking block extraction
890        if let Some(text) = &content {
891            if should_use_native_formatting(self, &model_name) {
892                let native_calls = extract_native_tool_calls(text);
893                if !native_calls.is_empty() {
894                    let mut existing = tool_calls.unwrap_or_default();
895                    existing.extend(native_calls);
896                    tool_calls = Some(existing);
897
898                    let stripped = strip_native_tool_call_text(text);
899                    content = if stripped.trim().is_empty() {
900                        None
901                    } else {
902                        Some(stripped)
903                    };
904                }
905            }
906        }
907
908        // Normalization: Tool arguments
909        if should_use_native_formatting(self, &model_name) {
910            if let Some(calls) = tool_calls.as_mut() {
911                for call in calls.iter_mut() {
912                    normalize_tool_argument_value(
913                        &call.function.name,
914                        &mut call.function.arguments,
915                    );
916                }
917            }
918        }
919
920        if should_use_native_formatting(self, &model_name)
921            && content.is_none()
922            && tool_calls.is_none()
923            && !prepared_messages.is_empty()
924        {
925            return Err(format_runtime_failure_message(
926                "model returned an empty response after native-format message preparation",
927            ));
928        }
929
930        Ok((content, tool_calls, Some(res.usage), res.finish_reason))
931    }
932
933    // ── Streaming call (used for plain-text responses) ────────────────────────
934
935    /// Stream a conversation (no tools). Emits Token/Done/Error events.
936    pub async fn stream_messages(
937        &self,
938        messages: &[ChatMessage],
939        tx: mpsc::Sender<InferenceEvent>,
940    ) -> Result<(), Box<dyn std::error::Error>> {
941        let provider = self.provider.read().await;
942        provider.stream(messages, tx).await
943    }
944
945    /// Single-turn streaming (legacy helper used by startup sequence).
946    pub async fn stream_generation(
947        &self,
948        prompt: &str,
949        snark: u8,
950        chaos: u8,
951        brief: bool,
952        professional: bool,
953        tx: mpsc::Sender<InferenceEvent>,
954    ) -> Result<(), Box<dyn std::error::Error>> {
955        let system =
956            self.build_system_prompt(snark, chaos, brief, professional, &[], None, None, &[]);
957        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
958        self.stream_messages(&messages, tx).await
959    }
960
961    // ── Swarm worker helpers (non-streaming) ──────────────────────────────────
962
963    /// Runs a task using the `worker_model` if set, otherwise falls back to the main `model`.
964    pub async fn generate_task_worker(
965        &self,
966        prompt: &str,
967        professional: bool,
968    ) -> Result<String, String> {
969        let current_model = self.current_model();
970        let model = self
971            .worker_model
972            .as_deref()
973            .unwrap_or(current_model.as_str());
974        self.generate_task_with_model(prompt, 0.1, professional, model)
975            .await
976    }
977
978    pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
979        self.generate_task_with_temp(prompt, 0.1, professional)
980            .await
981    }
982
983    pub async fn generate_task_with_temp(
984        &self,
985        prompt: &str,
986        temp: f32,
987        professional: bool,
988    ) -> Result<String, String> {
989        let current_model = self.current_model();
990        self.generate_task_with_model(prompt, temp, professional, &current_model)
991            .await
992    }
993
994    pub async fn generate_task_with_model(
995        &self,
996        prompt: &str,
997        _temp: f32,
998        professional: bool,
999        model: &str,
1000    ) -> Result<String, String> {
1001        let _permit = self
1002            .kv_semaphore
1003            .acquire()
1004            .await
1005            .map_err(|e| e.to_string())?;
1006
1007        let system =
1008            self.build_system_prompt(self.snark, 50, false, professional, &[], None, None, &[]);
1009        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1010        if let Err(detail) =
1011            preflight_chat_request(model, &messages, &[], self.current_context_length())
1012        {
1013            return Err(format_runtime_failure_message(&detail));
1014        }
1015
1016        let p = self.provider.read().await;
1017        let res = p
1018            .call_with_tools(&messages, &[], Some(model))
1019            .await
1020            .map_err(|e| format_runtime_failure_message(&e))?;
1021
1022        res.content
1023            .ok_or_else(|| "Empty response from model".to_string())
1024    }
1025
1026    // ── History management ────────────────────────────────────────────────────
1027
1028    /// Prune middle turns when context grows too large, keeping system + recent N.
1029    #[allow(dead_code)]
1030    pub fn snip_history(
1031        &self,
1032        turns: &[ChatMessage],
1033        max_tokens_estimate: usize,
1034        keep_recent: usize,
1035    ) -> Vec<ChatMessage> {
1036        let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
1037        if total_chars / 4 <= max_tokens_estimate {
1038            return turns.to_vec();
1039        }
1040        let keep = keep_recent.min(turns.len());
1041        let mut snipped = vec![turns[0].clone()];
1042        if turns.len() > keep + 1 {
1043            snipped.push(ChatMessage::system(&format!(
1044                "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
1045                turns.len() - keep - 1
1046            )));
1047            snipped.extend_from_slice(&turns[turns.len() - keep..]);
1048        } else {
1049            snipped = turns.to_vec();
1050        }
1051        snipped
1052    }
1053}
1054
1055fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
1056    serde_json::to_vec(value)
1057        .ok()
1058        .map_or(0, |bytes| bytes.len() / 4 + 1)
1059}
1060
1061const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
1062
1063pub fn estimate_message_tokens(message: &ChatMessage) -> usize {
1064    let content_tokens = match &message.content {
1065        MessageContent::Text(s) => s.len() / 4 + 1,
1066        MessageContent::Parts(parts) => parts
1067            .iter()
1068            .map(|part| match part {
1069                ContentPart::Text { text } => text.len() / 4 + 1,
1070                // Image payloads are transported as data URLs, but their base64
1071                // length should not be treated like plain text context pressure.
1072                ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
1073            })
1074            .sum(),
1075    };
1076    let tool_tokens: usize = message
1077        .tool_calls
1078        .iter()
1079        .flatten()
1080        .map(|call| (call.function.name.len() + call.function.arguments.to_string().len()) / 4 + 4)
1081        .sum();
1082    content_tokens + tool_tokens + 6
1083}
1084
1085pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
1086    messages.iter().map(estimate_message_tokens).sum()
1087}
1088
1089fn reserved_output_tokens(context_length: usize) -> usize {
1090    let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
1091    proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
1092}
1093
1094pub fn estimate_prompt_pressure(
1095    messages: &[ChatMessage],
1096    tools: &[ToolDefinition],
1097    context_length: usize,
1098) -> (usize, usize, usize, u8) {
1099    let estimated_input_tokens =
1100        estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
1101    let reserved_output = reserved_output_tokens(context_length);
1102    let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
1103    let percent = if context_length == 0 {
1104        0
1105    } else {
1106        ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
1107    };
1108    (
1109        estimated_input_tokens,
1110        reserved_output,
1111        estimated_total,
1112        percent,
1113    )
1114}
1115
1116fn preflight_chat_request(
1117    model: &str,
1118    messages: &[ChatMessage],
1119    tools: &[ToolDefinition],
1120    context_length: usize,
1121) -> Result<(), String> {
1122    let (estimated_input_tokens, reserved_output, estimated_total, _) =
1123        estimate_prompt_pressure(messages, tools, context_length);
1124
1125    if estimated_total > context_length {
1126        return Err(format!(
1127            "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
1128            model, estimated_input_tokens, reserved_output, estimated_total, context_length
1129        ));
1130    }
1131
1132    Ok(())
1133}
1134
1135/// Walk from CWD up to 4 parent directories and collect project guidance files.
1136/// Looks for rule files plus optional skill guidance such as CLAUDE.md,
1137/// .hematite/rules.md, SKILLS.md, SKILL.md, and .hematite/instructions.md.
1138/// Deduplicates by content hash; truncates at 4KB per file, 12KB total.
1139fn load_instruction_files() -> String {
1140    use std::collections::hash_map::DefaultHasher;
1141    use std::collections::HashSet;
1142    use std::hash::{Hash, Hasher};
1143
1144    let Ok(cwd) = std::env::current_dir() else {
1145        return String::new();
1146    };
1147    let mut result = String::new();
1148    let mut seen: HashSet<u64> = HashSet::new();
1149    let mut total_chars: usize = 0;
1150    const MAX_TOTAL: usize = 12_000;
1151    const MAX_PER_FILE: usize = 4_000;
1152
1153    let mut dir = cwd.clone();
1154    for _ in 0..4 {
1155        for name in crate::agent::instructions::PROJECT_GUIDANCE_FILES {
1156            let path = crate::agent::instructions::resolve_guidance_path(&dir, name);
1157            if !path.exists() {
1158                continue;
1159            }
1160            let Ok(content) = std::fs::read_to_string(&path) else {
1161                continue;
1162            };
1163            if content.trim().is_empty() {
1164                continue;
1165            }
1166
1167            let mut hasher = DefaultHasher::new();
1168            content.hash(&mut hasher);
1169            let h = hasher.finish();
1170            if !seen.insert(h) {
1171                continue;
1172            }
1173
1174            let truncated = if content.len() > MAX_PER_FILE {
1175                format!("{}...[truncated]", &content[..MAX_PER_FILE])
1176            } else {
1177                content
1178            };
1179
1180            if total_chars + truncated.len() > MAX_TOTAL {
1181                break;
1182            }
1183            total_chars += truncated.len();
1184            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
1185        }
1186        match dir.parent().map(|p| p.to_owned()) {
1187            Some(p) => dir = p,
1188            None => break,
1189        }
1190    }
1191
1192    if result.is_empty() {
1193        return String::new();
1194    }
1195    format!("\n\n# Project Instructions And Skills\n{}", result)
1196}
1197
1198fn load_agent_skill_catalog() -> String {
1199    let workspace_root = crate::tools::file_ops::workspace_root();
1200    let config = crate::agent::config::load_config();
1201    let discovery =
1202        crate::agent::instructions::discover_agent_skills(&workspace_root, &config.trust);
1203    crate::agent::instructions::render_skill_catalog(&discovery, 6_000)
1204        .map(|rendered| format!("\n\n{}", rendered))
1205        .unwrap_or_default()
1206}
1207
/// Extract the text of the first Gemma-4 thought block in `text`.
///
/// The block starts after the first `<|channel>thought` tag and ends at the
/// next `<channel|>` tag, or at the end of `text` when the block is unclosed.
/// Tags are matched ASCII-case-insensitively.
///
/// Returns `None` when there is no opening tag or the block is blank after
/// trimming; otherwise the trimmed block content.
pub fn extract_think_block(text: &str) -> Option<String> {
    // Official Gemma-4 native tags.
    const OPEN_TAG: &str = "<|channel>thought";
    const CLOSE_TAG: &str = "<channel|>";

    // ASCII lowercasing keeps every byte offset identical to `text`. Full
    // Unicode lowercasing can change byte lengths, which would make offsets
    // found in the lowercase copy invalid for slicing the original.
    let lower = text.to_ascii_lowercase();

    let content_start = lower.find(OPEN_TAG)? + OPEN_TAG.len();
    let close_pos = lower[content_start..]
        .find(CLOSE_TAG)
        .map_or(text.len(), |offset| content_start + offset);

    let content = text[content_start..close_pos].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
1230
/// Remove model reasoning output from `text`, leaving only the visible answer.
///
/// Rules are tried in order and the first that applies wins:
/// 1. A stray leading `</think>` (after leading whitespace) is dropped.
/// 2. If a `<channel|>` closing tag exists, the answer is everything after its
///    first occurrence, with any further channel tags removed.
/// 3. If an opening reasoning tag exists without that closing tag, only the
///    text before the earliest opening tag is kept.
/// 4. If the text contains one of the known untagged reasoning phrases,
///    leading reasoning or blank lines are dropped.
/// 5. Otherwise leaked tool-call and chat-template tags are removed.
///
/// Every result is trimmed and has triple newlines replaced by double ones.
/// Tag matching is ASCII-case-insensitive, which keeps byte offsets in the
/// lowercase copy valid for slicing the original text.
pub fn strip_think_blocks(text: &str) -> String {
    // Fast-path: strip a stray </think> the model emits at the start when it skips
    // the opening tag (common with Qwen after tool calls). Strip it before the lower
    // allocation so it can't slip through any branch below.
    const STRAY_CLOSE: &str = "</think>";
    let text = {
        let t = text.trim_start();
        // `get` yields None when the prefix is too short or would split a
        // character, so the slice below can never panic.
        match t.get(..STRAY_CLOSE.len()) {
            Some(prefix) if prefix.eq_ignore_ascii_case(STRAY_CLOSE) => &t[STRAY_CLOSE.len()..],
            _ => text,
        }
    };

    // ASCII lowercasing preserves byte offsets; Unicode lowercasing does not.
    let lower = text.to_ascii_lowercase();

    // Use the official Gemma-4 closing tag — answer is everything after it.
    if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
        let answer = text[end..]
            .replace("<|channel>thought", "")
            .replace("<channel|>", "");
        return answer.trim().replace("\n\n\n", "\n\n").to_string();
    }

    // No closing tag — if there's an unclosed opening tag, discard everything
    // from the earliest opening tag onwards.
    let first_open = [
        lower.find("<|channel>thought"), // Prioritize Gemma-4 native
        lower.find("<think>"),
        lower.find("<thinking>"),
        lower.find("<thought>"),
        lower.find("<|think|>"),
    ]
    .iter()
    .copied()
    .flatten()
    .min();

    if let Some(start) = first_open {
        if start > 0 {
            return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
        }
        return String::new();
    }

    // If the model outputs 'naked' reasoning without tags:
    // Strip leading sentences like "The user asked..." or "I should present..."
    // if they appear before actual answer content.
    let naked_reasoning_phrases: &[&str] = &[
        "the user asked",
        "the user is asking",
        "the user wants",
        "i will structure",
        "i should provide",
        "i should give",
        "i should avoid",
        "i should note",
        "i should focus",
        "i should keep",
        "i should respond",
        "i should present",
        "i should display",
        "i should show",
        "i need to",
        "i can see from",
        "without being overly",
        "let me ",
        "necessary information in my identity",
        "was computed successfully",
        "computed successfully",
    ];
    let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
    if is_naked_reasoning {
        let lines: Vec<&str> = text.lines().collect();
        // Count the leading lines that are reasoning prose or blank; the first
        // line that looks like real answer content ends the skip.
        let start_idx = lines
            .iter()
            .take_while(|line| {
                let l = line.to_ascii_lowercase();
                l.trim().is_empty() || naked_reasoning_phrases.iter().any(|p| l.contains(p))
            })
            .count();
        if start_idx < lines.len() {
            return lines[start_idx..]
                .join("\n")
                .trim()
                .replace("\n\n\n", "\n\n")
                .to_string();
        }
        // Entire response was reasoning prose — return empty.
        return String::new();
    }

    // Strip leaked XML tool-call fragments that Qwen sometimes emits when it
    // abandons a tool call mid-generation (e.g. </parameter></function></tool_call>).
    let cleaned = strip_xml_tool_call_artifacts(text);
    cleaned.trim().replace("\n\n\n", "\n\n").to_string()
}

/// Remove stray tool-call, reasoning, and chat-template tags that local models
/// occasionally leak into visible output when they start-then-abandon a tool call.
///
/// Tags are matched ASCII-case-insensitively and processed in list order. Each
/// tag is removed repeatedly until it no longer occurs, so occurrences formed
/// by an earlier removal are removed too. Whitespace is left untouched.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    // Tags to remove (both open and close forms). Every entry must be
    // lowercase ASCII because it is searched for in a lowercased copy.
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        // Stray think/reasoning closing tags that leak after block extraction.
        "</think>",
        "<thinking>",
        "</thought>",
        "</thinking>",
        // Gemma-style turn markers that Qwen occasionally mirrors back from the system prompt.
        "<|turn>system",
        "<|turn>user",
        "<|turn>assistant",
        "<|turn>tool",
        "<turn|>",
        "<|think|>",
        // ChatML EOS/BOS tokens that can leak at end-of-generation.
        "<|im_start|>",
        "<|im_end|>",
        "<|endoftext|>",
    ];

    let mut out = text.to_string();
    // Lowercase mirror of `out`. ASCII lowercasing keeps byte offsets aligned,
    // so the same range can be removed from both strings, and the mirror does
    // not have to be rebuilt after every removal.
    let mut lower = out.to_ascii_lowercase();
    for tag in XML_ARTIFACTS.iter().copied() {
        while let Some(pos) = lower.find(tag) {
            let end = pos + tag.len();
            lower.replace_range(pos..end, "");
            out.replace_range(pos..end, "");
        }
    }
    out
}
1378
1379/// Extract native Gemma-4 <|tool_call|> tags from text.
1380/// Format: <|tool_call|>call:func_name{key:<|"|>value<|"|>, key2:value2}<tool_call|>
1381pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
1382    use regex::Regex;
1383    let mut results = Vec::new();
1384
1385    // -- Format 1: Gemma 4 Native (call:name{args}) --
1386    let re_call = Regex::new(
1387        r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
1388    ).unwrap();
1389    let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
1390
1391    for cap in re_call.captures_iter(text) {
1392        let name = cap[1].to_string();
1393        let args_str = &cap[2];
1394        let mut arguments = serde_json::Map::new();
1395
1396        for arg_cap in re_arg.captures_iter(args_str) {
1397            let key = arg_cap[1].to_string();
1398            let val_raw = arg_cap
1399                .get(2)
1400                .map(|m| m.as_str())
1401                .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
1402                .unwrap_or("")
1403                .trim();
1404            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
1405
1406            let val = if normalized_raw == "true" {
1407                Value::Bool(true)
1408            } else if normalized_raw == "false" {
1409                Value::Bool(false)
1410            } else if let Ok(n) = normalized_raw.parse::<i64>() {
1411                Value::Number(n.into())
1412            } else if let Ok(n) = normalized_raw.parse::<u64>() {
1413                Value::Number(n.into())
1414            } else if let Ok(n) = normalized_raw.parse::<f64>() {
1415                serde_json::Number::from_f64(n)
1416                    .map(Value::Number)
1417                    .unwrap_or(Value::String(normalized_raw.clone()))
1418            } else {
1419                Value::String(normalized_raw)
1420            };
1421
1422            arguments.insert(key, val);
1423        }
1424
1425        results.push(ToolCallResponse {
1426            id: format!("call_{}", rand::random::<u32>()),
1427            call_type: "function".to_string(),
1428            function: ToolCallFn {
1429                name,
1430                arguments: Value::Object(arguments),
1431            },
1432            index: None,
1433        });
1434    }
1435
1436    // -- Format 2: XML (Qwen/Claude style) --
1437    let re_xml_call = Regex::new(
1438        r#"(?s)<tool_call>\s*<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>)?\s*</tool_call>"#
1439    ).unwrap();
1440    let re_xml_param =
1441        Regex::new(r#"(?s)<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)</parameter>"#).unwrap();
1442
1443    for cap in re_xml_call.captures_iter(text) {
1444        let name = cap[1].to_string();
1445        let body = &cap[2];
1446        let mut arguments = serde_json::Map::new();
1447
1448        for p_cap in re_xml_param.captures_iter(body) {
1449            let key = p_cap[1].to_string();
1450            let val_raw = p_cap[2].trim();
1451            let val = if val_raw == "true" {
1452                Value::Bool(true)
1453            } else if val_raw == "false" {
1454                Value::Bool(false)
1455            } else if let Ok(n) = val_raw.parse::<i64>() {
1456                Value::Number(n.into())
1457            } else if let Ok(n) = val_raw.parse::<u64>() {
1458                Value::Number(n.into())
1459            } else {
1460                Value::String(val_raw.to_string())
1461            };
1462            arguments.insert(key, val);
1463        }
1464
1465        results.push(ToolCallResponse {
1466            id: format!("call_{}", rand::random::<u32>()),
1467            call_type: "function".to_string(),
1468            function: ToolCallFn {
1469                name,
1470                arguments: Value::Object(arguments),
1471            },
1472            index: None,
1473        });
1474    }
1475
1476    // -- Format 3: shorthand XML wrapper (<tool_call>name(key="value")</tool_call>) --
1477    let re_short_call =
1478        Regex::new(r#"(?s)<tool_call>\s*([A-Za-z_][A-Za-z0-9_]*)\((.*?)\)\s*</tool_call>"#)
1479            .unwrap();
1480    let re_short_arg = Regex::new(
1481        r#"([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(?:"((?:\\.|[^"])*)"|'((?:\\.|[^'])*)'|([^,\)]+))"#,
1482    )
1483    .unwrap();
1484
1485    for cap in re_short_call.captures_iter(text) {
1486        let name = cap[1].to_string();
1487        let args_str = cap[2].trim();
1488        let mut arguments = serde_json::Map::new();
1489
1490        for arg_cap in re_short_arg.captures_iter(args_str) {
1491            let key = arg_cap[1].to_string();
1492            let val_raw = arg_cap
1493                .get(2)
1494                .or_else(|| arg_cap.get(3))
1495                .or_else(|| arg_cap.get(4))
1496                .map(|m| m.as_str())
1497                .unwrap_or("")
1498                .trim();
1499            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
1500
1501            let val = if normalized_raw == "true" {
1502                Value::Bool(true)
1503            } else if normalized_raw == "false" {
1504                Value::Bool(false)
1505            } else if let Ok(n) = normalized_raw.parse::<i64>() {
1506                Value::Number(n.into())
1507            } else if let Ok(n) = normalized_raw.parse::<u64>() {
1508                Value::Number(n.into())
1509            } else if let Ok(n) = normalized_raw.parse::<f64>() {
1510                serde_json::Number::from_f64(n)
1511                    .map(Value::Number)
1512                    .unwrap_or(Value::String(normalized_raw.clone()))
1513            } else {
1514                Value::String(normalized_raw)
1515            };
1516
1517            arguments.insert(key, val);
1518        }
1519
1520        results.push(ToolCallResponse {
1521            id: format!("call_{}", rand::random::<u32>()),
1522            call_type: "function".to_string(),
1523            function: ToolCallFn {
1524                name,
1525                arguments: Value::Object(arguments),
1526            },
1527            index: None,
1528        });
1529    }
1530
1531    results
1532}
1533
1534pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
1535    let trimmed = raw.trim();
1536    let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
1537
1538    let mut value = match serde_json::from_str::<Value>(&candidate) {
1539        Ok(v) => v,
1540        Err(_) => return candidate,
1541    };
1542    normalize_tool_argument_value(tool_name, &mut value);
1543    value.to_string()
1544}
1545
1546pub fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
1547    match value {
1548        Value::String(s) => *s = normalize_string_arg(s),
1549        Value::Array(items) => {
1550            for item in items {
1551                normalize_tool_argument_value(tool_name, item);
1552            }
1553        }
1554        Value::Object(map) => {
1555            for val in map.values_mut() {
1556                normalize_tool_argument_value(tool_name, val);
1557            }
1558            if tool_name == "grep_files" {
1559                if let Some(Value::String(pattern)) = map.get_mut("pattern") {
1560                    *pattern = normalize_regex_pattern(pattern);
1561                }
1562            }
1563            for key in ["path", "extension", "query", "command", "reason"] {
1564                if let Some(Value::String(s)) = map.get_mut(key) {
1565                    *s = normalize_string_arg(s);
1566                }
1567            }
1568        }
1569        _ => {}
1570    }
1571}
1572
/// Removes exactly one layer of matching quotes from `input`.
///
/// Returns `None` when the input is shorter than two bytes or is not wrapped
/// in a matching pair of `"`, `'` or `` ` `` characters. Otherwise the inner
/// text is returned with `\"` turned into `"`, then `\\` turned into `\`,
/// and surrounding whitespace trimmed.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    if input.len() < 2 {
        return None;
    }

    let opening = input.chars().next()?;
    let closing = input.chars().next_back()?;
    let wrapped = matches!(opening, '"' | '\'' | '`') && opening == closing;
    if !wrapped {
        return None;
    }

    // All accepted quote characters are one byte wide, so byte offsets 1 and
    // len - 1 are guaranteed to fall on character boundaries.
    let body = &input[1..input.len() - 1];
    let without_escaped_quotes = body.replace("\\\"", "\"");
    let without_escaped_slashes = without_escaped_quotes.replace("\\\\", "\\");
    Some(without_escaped_slashes.trim().to_string())
}
1586
/// Trims `input` and peels off every layer of matching surrounding quotes.
///
/// A layer is a leading and trailing `"`, `'` or `` ` `` of the same kind.
/// After each layer is removed the remainder is trimmed again, and the
/// process repeats until the text is shorter than two bytes or is no longer
/// wrapped in a matching pair.
fn normalize_string_arg(input: &str) -> String {
    const QUOTES: [char; 3] = ['"', '\'', '`'];

    let mut current = input.trim();
    while current.len() >= 2 {
        // Find the first quote kind (in priority order) that wraps the text.
        let mut peeled = None;
        for &quote in QUOTES.iter() {
            let inner = current
                .strip_prefix(quote)
                .and_then(|rest| rest.strip_suffix(quote));
            if inner.is_some() {
                peeled = inner;
                break;
            }
        }

        match peeled {
            Some(inner) => current = inner.trim(),
            None => break,
        }
    }

    current.to_string()
}
1604
1605fn normalize_regex_pattern(input: &str) -> String {
1606    let out = normalize_string_arg(input);
1607    if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
1608        out[1..out.len() - 1].to_string()
1609    } else {
1610        out
1611    }
1612}
1613
1614fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
1615    let mut system_blocks = Vec::new();
1616    let mut prepared = Vec::new();
1617    let mut seeded = false;
1618
1619    for message in messages {
1620        if message.role == "system" {
1621            let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
1622                .trim()
1623                .to_string();
1624            if !cleaned.is_empty() {
1625                system_blocks.push(cleaned);
1626            }
1627            continue;
1628        }
1629
1630        let mut clone = message.clone();
1631        clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
1632
1633        if !seeded && message.role == "user" {
1634            let mut merged = String::new();
1635            if !system_blocks.is_empty() {
1636                merged.push_str("System instructions for this turn:\n");
1637                merged.push_str(&system_blocks.join("\n\n"));
1638                merged.push_str("\n\n");
1639            }
1640            merged.push_str(clone.content.as_str());
1641            clone.content = MessageContent::Text(merged);
1642            seeded = true;
1643        }
1644
1645        prepared.push(clone);
1646    }
1647
1648    if !seeded && !system_blocks.is_empty() {
1649        prepared.insert(
1650            0,
1651            ChatMessage::user(&format!(
1652                "System instructions for this turn:\n{}",
1653                system_blocks.join("\n\n")
1654            )),
1655        );
1656    }
1657
1658    prepared
1659}
1660
/// Deletes legacy turn-wrapper markers from `text` and trims the result.
///
/// The opening markers (`<|turn>ROLE\n` for system, user, assistant and tool)
/// are removed first, followed by the closing `<turn|>` marker.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    // Removal order matters only in that it mirrors a sequential replace chain.
    const MARKERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let mut cleaned = text.to_string();
    for marker in MARKERS.iter() {
        cleaned = cleaned.replace(*marker, "");
    }
    cleaned.trim().to_string()
}
1670
1671pub fn strip_native_tool_call_text(text: &str) -> String {
1672    use regex::Regex;
1673    // Format 1: Gemma 4 Native
1674    let re_call = Regex::new(
1675        r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
1676    ).unwrap();
1677    // Format 2: XML (Qwen/Claude style)
1678    let re_xml = Regex::new(r#"(?s)<tool_call>\s*<function=.*?>.*?</tool_call>"#).unwrap();
1679    // Format 3: shorthand XML wrapper
1680    let re_short =
1681        Regex::new(r#"(?s)<tool_call>\s*[A-Za-z_][A-Za-z0-9_]*\(.*?\)\s*</tool_call>"#).unwrap();
1682    let re_response =
1683        Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
1684            .unwrap();
1685    let without_calls = re_call.replace_all(text, "");
1686    let without_xml = re_xml.replace_all(without_calls.as_ref(), "");
1687    let without_short = re_short.replace_all(without_xml.as_ref(), "");
1688    re_response
1689        .replace_all(without_short.as_ref(), "")
1690        .trim()
1691        .to_string()
1692}
1693
/// Decides which context length to report for the currently effective model.
///
/// * No model loaded (blank name or the literal `"no model loaded"`): `0`.
/// * A positive `detected_context`: that value.
/// * Nothing detected but the model is unchanged: `previous_context`.
/// * Nothing detected and the model changed: `0`.
fn resolve_runtime_context(
    previous_model: &str,
    previous_context: usize,
    effective_model: &str,
    detected_context: usize,
) -> usize {
    let nothing_loaded =
        effective_model == "no model loaded" || effective_model.trim().is_empty();
    if nothing_loaded {
        return 0;
    }

    if detected_context > 0 {
        return detected_context;
    }

    // A stale value is only trustworthy when it belongs to the same model.
    match effective_model == previous_model {
        true => previous_context,
        false => 0,
    }
}
1710
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Serializes the tests in this module that change the process-wide
    /// current directory. The test harness runs tests on parallel threads, so
    /// without this lock two such tests can observe each other's directory.
    static CWD_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// RAII guard that switches the current directory and restores the
    /// previous one when dropped, even if the test body panics.
    struct CwdGuard {
        previous: std::path::PathBuf,
        // Held for the guard's lifetime; released only after `drop` has
        // restored the directory (fields are dropped after `Drop::drop` runs).
        _lock: std::sync::MutexGuard<'static, ()>,
    }

    impl CwdGuard {
        /// Takes the lock, remembers the current directory, and enters `dir`.
        fn enter(dir: &std::path::Path) -> Self {
            // A poisoned lock only means an earlier cwd test panicked; its
            // guard already restored the directory, so the lock is reusable.
            let lock = CWD_LOCK
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            let previous = std::env::current_dir().unwrap();
            std::env::set_current_dir(dir).unwrap();
            Self {
                previous,
                _lock: lock,
            }
        }
    }

    impl Drop for CwdGuard {
        fn drop(&mut self) {
            // Best effort: panicking inside `drop` during unwinding would abort.
            let _ = std::env::set_current_dir(&self.previous);
        }
    }

    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let system = engine.build_system_prompt(0, 50, false, true, &[], None, None, &[]);
        assert!(system.contains(crate::HEMATITE_VERSION));
    }

    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        let text = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "read_file");

        let args: Value = calls[0].function.arguments.clone();
        assert_eq!(args.get("limit").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(args.get("offset").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(
            args.get("path").and_then(|v| v.as_str()),
            Some("src/ui/tui.rs")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<tool_call|>"));
    }

    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let text = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "list_files");
        assert_eq!(calls[1].function.name, "read_file");

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<|tool_response"));
        assert!(!stripped.contains("<tool_response|>"));
    }

    #[test]
    fn create_directory_is_treated_as_mutating_repo_write() {
        let metadata = tool_metadata_for_name("create_directory");
        assert!(metadata.mutates_workspace);
        assert!(!metadata.read_only_friendly);
    }

    #[test]
    fn extracts_qwen_xml_tool_calls_from_reasoning() {
        let text = r#"Based on the project structure, I need to check the binary.
<tool_call>
<function=shell>
<parameter=command>
ls -la hematite.exe
</parameter>
<parameter=reason>
Check if the binary exists
</parameter>
</function>
</tool_call>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "shell");

        let args: Value = calls[0].function.arguments.clone();
        assert_eq!(
            args.get("command").and_then(|v| v.as_str()),
            Some("ls -la hematite.exe")
        );
        assert_eq!(
            args.get("reason").and_then(|v| v.as_str()),
            Some("Check if the binary exists")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("<function=shell>"));
    }

    #[test]
    fn extracts_shorthand_tool_calls_from_reasoning() {
        let text = r#"<thinking>
The user wants a search first.
</thinking>

I'll search before continuing.

<tool_call>research_web(query="uefn toolbelt python automation unreal engine fortnite")</tool_call>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "research_web");

        let args: Value = calls[0].function.arguments.clone();
        assert_eq!(
            args.get("query").and_then(|v| v.as_str()),
            Some("uefn toolbelt python automation unreal engine fortnite")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("research_web(query="));
    }

    #[test]
    fn strips_thinking_tag_as_reasoning_prefix() {
        let cleaned =
            strip_think_blocks("<thinking>\nThe user wants a search.\n</thinking>\nVisible answer");
        assert_eq!(cleaned, "");
    }

    #[test]
    fn resolve_runtime_context_returns_zero_when_no_model_loaded() {
        assert_eq!(
            resolve_runtime_context("qwen/qwen3.5-9b", 32000, "no model loaded", 0),
            0
        );
    }

    #[test]
    fn resolve_runtime_context_preserves_previous_only_for_same_model() {
        assert_eq!(
            resolve_runtime_context("qwen/qwen3.5-9b", 32000, "qwen/qwen3.5-9b", 0),
            32000
        );
        assert_eq!(
            resolve_runtime_context("qwen/qwen3.5-9b", 32000, "bonsai-8b", 0),
            0
        );
    }

    #[test]
    fn load_instruction_files_includes_workspace_guidance_files() {
        let temp = tempfile::tempdir().unwrap();

        fs::write(
            temp.path().join("SKILLS.md"),
            "# Workspace Skills\n- Prefer API-first changes before UI polish.",
        )
        .unwrap();

        // Restore the directory before asserting so a failed assertion cannot
        // leave the process inside the temporary workspace.
        let loaded = {
            let _cwd = CwdGuard::enter(temp.path());
            load_instruction_files()
        };

        assert!(loaded.contains("SKILLS.md"));
        assert!(loaded.contains("Prefer API-first changes before UI polish."));
    }

    #[test]
    fn load_agent_skill_catalog_includes_skill_directory_entries() {
        let temp = tempfile::tempdir().unwrap();

        std::fs::create_dir_all(temp.path().join(".agents/skills/code-review")).unwrap();
        fs::write(
            temp.path().join(".agents/skills/code-review/SKILL.md"),
            "---\nname: code-review\ndescription: Review diffs and flag regressions.\ncompatibility: Requires git\n---\n",
        )
        .unwrap();

        // Restore the directory before asserting so a failed assertion cannot
        // leave the process inside the temporary workspace.
        let loaded = {
            let _cwd = CwdGuard::enter(temp.path());
            load_agent_skill_catalog()
        };

        assert!(loaded.contains("Agent Skills Catalog"));
        assert!(loaded.contains("code-review"));
        assert!(loaded.contains("Review diffs and flag regressions."));
    }
}
hematite/agent/inference.rs

hematite/agent/
inference.rs