hematite/agent/
inference.rs

1use std::fmt::Write as _;
2
3use serde::Serialize;
4use serde_json::Value;
5use tokio::sync::{mpsc, Semaphore};
6
7pub use crate::agent::economics::{SessionEconomics, ToolRecord};
8pub use crate::agent::types::*;
9
10// ── Engine ────────────────────────────────────────────────────────────────────
11
/// Shared handle to the active model provider together with cached runtime
/// metadata (model id, context window) and session-wide flags.
pub struct InferenceEngine {
    /// Active backend behind an async read/write lock. `new` installs either
    /// an Ollama provider or an LM Studio provider here.
    pub provider:
        std::sync::Arc<tokio::sync::RwLock<Box<dyn crate::agent::provider::ModelProvider>>>,
    /// Model id last recorded by `set_runtime_profile`; empty after `new`.
    pub cached_model: std::sync::Arc<std::sync::RwLock<String>>,
    /// Context length last recorded by `set_runtime_profile`; 0 after `new`.
    pub cached_context: std::sync::Arc<std::sync::atomic::AtomicUsize>,
    /// `scheme://host[:port]` portion of the API URL, as derived in `new`.
    pub base_url: String,
    /// Persona label interpolated into the non-professional system prompt.
    pub species: String,
    /// Snark level supplied to `new`.
    pub snark: u8,
    /// Created with 3 permits in `new`.
    /// NOTE(review): presumably bounds concurrent inference requests — confirm against callers.
    pub kv_semaphore: Semaphore,
    /// Shared session accounting state.
    /// NOTE(review): exact contents live in `agent::economics` — not visible here.
    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
    /// Optional model ID for worker-level tasks (Swarms / research).
    pub worker_model: Option<String>,
    /// Opt-in Gemma-native request shaping. Off by default.
    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
    /// Global cancellation token for hard-interrupting the inference stream.
    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
}
29
/// Returns `true` when `model` contains `gemma-4` or `gemma4`, compared
/// ASCII case-insensitively.
pub fn is_hematite_native_model(model: &str) -> bool {
    let normalized = model.to_ascii_lowercase();
    ["gemma-4", "gemma4"]
        .iter()
        .any(|marker| normalized.contains(*marker))
}
34
35fn should_use_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
36    is_hematite_native_model(model) && engine.gemma_native_formatting_enabled()
37}
38
39// ── OpenAI Tool Definition ────────────────────────────────────────────────────
40
41pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
42    if name.starts_with("mcp__") {
43        let lower = name.to_ascii_lowercase();
44        let mutates_workspace = [
45            "__edit",
46            "__write",
47            "__create",
48            "__move",
49            "__delete",
50            "__remove",
51            "__rename",
52            "__replace",
53            "__patch",
54        ]
55        .iter()
56        .any(|needle| lower.contains(needle));
57        return ToolMetadata {
58            category: ToolCategory::External,
59            mutates_workspace,
60            external_surface: true,
61            trust_sensitive: true,
62            read_only_friendly: !mutates_workspace,
63            plan_scope: false,
64        };
65    }
66
67    match name {
68        "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
69            category: ToolCategory::RepoRead,
70            mutates_workspace: false,
71            external_surface: false,
72            trust_sensitive: false,
73            read_only_friendly: true,
74            plan_scope: true,
75        },
76        "create_directory" | "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => {
77            ToolMetadata {
78                category: ToolCategory::RepoWrite,
79                mutates_workspace: true,
80                external_surface: false,
81                trust_sensitive: true,
82                read_only_friendly: false,
83                plan_scope: true,
84            }
85        }
86        "trace_runtime_flow" => ToolMetadata {
87            category: ToolCategory::Architecture,
88            mutates_workspace: false,
89            external_surface: false,
90            trust_sensitive: false,
91            read_only_friendly: true,
92            plan_scope: false,
93        },
94        "describe_toolchain" => ToolMetadata {
95            category: ToolCategory::Toolchain,
96            mutates_workspace: false,
97            external_surface: false,
98            trust_sensitive: false,
99            read_only_friendly: true,
100            plan_scope: false,
101        },
102        "shell" => ToolMetadata {
103            category: ToolCategory::Runtime,
104            mutates_workspace: true,
105            external_surface: false,
106            trust_sensitive: true,
107            read_only_friendly: false,
108            plan_scope: false,
109        },
110        "inspect_host" => ToolMetadata {
111            category: ToolCategory::Runtime,
112            mutates_workspace: false,
113            external_surface: false,
114            trust_sensitive: false,
115            read_only_friendly: true,
116            plan_scope: false,
117        },
118        "resolve_host_issue" => ToolMetadata {
119            category: ToolCategory::Runtime,
120            mutates_workspace: true,
121            external_surface: true,
122            trust_sensitive: true,
123            read_only_friendly: false,
124            plan_scope: false,
125        },
126        "run_hematite_maintainer_workflow" => ToolMetadata {
127            category: ToolCategory::Workflow,
128            mutates_workspace: true,
129            external_surface: false,
130            trust_sensitive: true,
131            read_only_friendly: false,
132            plan_scope: false,
133        },
134        "run_workspace_workflow" => ToolMetadata {
135            category: ToolCategory::Workflow,
136            mutates_workspace: true,
137            external_surface: false,
138            trust_sensitive: true,
139            read_only_friendly: false,
140            plan_scope: false,
141        },
142        "verify_build" => ToolMetadata {
143            category: ToolCategory::Verification,
144            mutates_workspace: false,
145            external_surface: false,
146            trust_sensitive: false,
147            read_only_friendly: true,
148            plan_scope: true,
149        },
150        "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
151            ToolMetadata {
152                category: ToolCategory::Git,
153                mutates_workspace: true,
154                external_surface: false,
155                trust_sensitive: true,
156                read_only_friendly: false,
157                plan_scope: false,
158            }
159        }
160        "research_web" | "fetch_docs" => ToolMetadata {
161            category: ToolCategory::Research,
162            mutates_workspace: false,
163            external_surface: false,
164            trust_sensitive: false,
165            read_only_friendly: true,
166            plan_scope: false,
167        },
168        "vision_analyze" => ToolMetadata {
169            category: ToolCategory::Vision,
170            mutates_workspace: false,
171            external_surface: false,
172            trust_sensitive: false,
173            read_only_friendly: true,
174            plan_scope: false,
175        },
176        "lsp_definitions"
177        | "lsp_references"
178        | "lsp_hover"
179        | "lsp_rename_symbol"
180        | "lsp_get_diagnostics"
181        | "lsp_search_symbol" => ToolMetadata {
182            category: ToolCategory::Lsp,
183            mutates_workspace: false,
184            external_surface: false,
185            trust_sensitive: false,
186            read_only_friendly: true,
187            plan_scope: false,
188        },
189        "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
190            category: ToolCategory::Workflow,
191            mutates_workspace: false,
192            external_surface: false,
193            trust_sensitive: false,
194            read_only_friendly: true,
195            plan_scope: true,
196        },
197        "manage_tasks" => ToolMetadata {
198            category: ToolCategory::Workflow,
199            mutates_workspace: false,
200            external_surface: false,
201            trust_sensitive: false,
202            read_only_friendly: true,
203            plan_scope: false,
204        },
205        _ => ToolMetadata {
206            category: ToolCategory::Other,
207            mutates_workspace: false,
208            external_surface: false,
209            trust_sensitive: false,
210            read_only_friendly: true,
211            plan_scope: false,
212        },
213    }
214}
215// ── Message types migrated to types.rs ────────────────────────────────────────
216
217// ── HTTP request / response shapes ───────────────────────────────────────────
218
// NOTE(review): neither constant is referenced in the visible part of this
// file; presumably they bound the number of tokens reserved for model output
// when budgeting a request — confirm against their use sites.
/// Lower bound for the reserved-output token count.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1024;
/// Upper bound for the reserved-output token count.
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4096;
221
/// A context window of at most 8,192 tokens is the "tiny" tier.
fn is_tiny_context_window(context_length: usize) -> bool {
    matches!(context_length, 0..=8_192)
}
225
/// A context window strictly above 8,192 and up to 49,152 tokens (inclusive)
/// is the "compact" tier.
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
229
230pub fn is_compact_context_window_pub(context_length: usize) -> bool {
231    is_compact_context_window(context_length)
232}
233
/// Detects provider error text that indicates a context-limit failure.
///
/// `lower` is matched as-is (callers pass already-lowercased text). It
/// matches when it mentions both `n_keep` and `n_ctx`, or when it contains
/// any one of the standalone phrases below.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const STANDALONE_PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    let names_both_counters = ["n_keep", "n_ctx"]
        .iter()
        .all(|counter| lower.contains(*counter));

    names_both_counters
        || STANDALONE_PHRASES
            .iter()
            .any(|phrase| lower.contains(*phrase))
}
241
242fn classify_runtime_failure_tag(detail: &str) -> &'static str {
243    let lower = detail.to_ascii_lowercase();
244    if lower.contains("context_window_blocked")
245        || lower.contains("context ceiling reached")
246        || lower.contains("exceeds the")
247        || is_provider_context_limit_detail(&lower)
248    {
249        "context_window"
250    } else if lower.contains("empty response from model")
251        || lower.contains("model returned an empty response")
252    {
253        "empty_model_response"
254    } else if lower.contains("action blocked:")
255        || lower.contains("access denied")
256        || lower.contains("declined by user")
257    {
258        "tool_policy_blocked"
259    } else {
260        "provider_degraded"
261    }
262}
263
/// Returns the user-facing recovery advice for a failure tag. Unrecognised
/// tags receive the generic retry advice.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const GENERIC_ADVICE: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";
    const ADVICE_BY_TAG: [(&str, &str); 3] = [
        (
            "context_window",
            "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.",
        ),
        (
            "empty_model_response",
            "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.",
        ),
        (
            "tool_policy_blocked",
            "Stay inside the allowed workflow or switch modes before retrying.",
        ),
    ];

    ADVICE_BY_TAG
        .iter()
        .find(|(known_tag, _)| *known_tag == tag)
        .map_or(GENERIC_ADVICE, |(_, advice)| *advice)
}
278
279fn format_runtime_failure_message(detail: &str) -> String {
280    let tag = classify_runtime_failure_tag(detail);
281    format!(
282        "[failure:{}] {} Detail: {}",
283        tag,
284        runtime_failure_guidance(tag),
285        detail.trim()
286    )
287}
288
289// ── Events pushed to the TUI (migrated to types.rs) ──────────────────────────
290
291// ── Engine implementation ─────────────────────────────────────────────────────
292
293impl InferenceEngine {
294    pub fn new(
295        api_url: String,
296        species: String,
297        snark: u8,
298    ) -> Result<Self, Box<dyn std::error::Error>> {
299        let client = reqwest::Client::builder()
300            .timeout(std::time::Duration::from_secs(180))
301            .build()?;
302
303        let base_url = {
304            let trimmed = api_url.trim_end_matches('/');
305            if let Some(scheme_end) = trimmed.find("://") {
306                let after_scheme = &trimmed[scheme_end + 3..];
307                if let Some(path_start) = after_scheme.find('/') {
308                    format!(
309                        "{}://{}",
310                        &trimmed[..scheme_end],
311                        &after_scheme[..path_start]
312                    )
313                } else {
314                    trimmed.to_string()
315                }
316            } else {
317                trimmed.to_string()
318            }
319        };
320
321        let api_url_full = if api_url.ends_with("/chat/completions") {
322            api_url
323        } else if api_url.ends_with("/") {
324            format!("{}chat/completions", api_url)
325        } else {
326            format!("{}/chat/completions", api_url)
327        };
328
329        let lms = crate::agent::lms::LmsHarness::new();
330        let ollama_harness = crate::agent::ollama::OllamaHarness::new(&base_url);
331
332        let provider = if base_url.contains("11434") {
333            Box::new(crate::agent::provider::OllamaProvider {
334                client: client.clone(),
335                base_url: base_url.clone(),
336                model: String::new(),
337                context_length: 8192,
338                embed_model: std::sync::Arc::new(std::sync::RwLock::new(None)),
339                ollama: ollama_harness,
340            }) as Box<dyn crate::agent::provider::ModelProvider>
341        } else {
342            Box::new(crate::agent::provider::LmsProvider {
343                client: client.clone(),
344                api_url: api_url_full,
345                base_url: base_url.clone(),
346                model: String::new(),
347                context_length: 0,
348                lms,
349            }) as Box<dyn crate::agent::provider::ModelProvider>
350        };
351
352        Ok(Self {
353            provider: std::sync::Arc::new(tokio::sync::RwLock::new(provider)),
354            cached_model: std::sync::Arc::new(std::sync::RwLock::new(String::new())),
355            cached_context: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)),
356            base_url: base_url.clone(),
357            species: species.clone(),
358            snark,
359            kv_semaphore: Semaphore::new(3),
360            economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
361            worker_model: None,
362            gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
363            cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
364        })
365    }
366
367    pub fn set_gemma_native_formatting(&self, enabled: bool) {
368        self.gemma_native_formatting
369            .store(enabled, std::sync::atomic::Ordering::SeqCst);
370    }
371
372    pub async fn health_check(&self) -> bool {
373        let p = self.provider.read().await;
374        p.health_check().await
375    }
376
377    pub async fn provider_name(&self) -> String {
378        let p = self.provider.read().await;
379        p.name().to_string()
380    }
381
382    pub async fn get_loaded_model(&self) -> Option<String> {
383        let p = self.provider.read().await;
384        match p.detect_model().await {
385            Ok(m) if m.is_empty() => Some("".to_string()),
386            Ok(m) => Some(m),
387            Err(_) => None,
388        }
389    }
390
391    pub async fn get_embedding_model(&self) -> Option<String> {
392        let p = self.provider.read().await;
393        p.get_embedding_model().await
394    }
395
396    pub async fn load_model(&self, model_id: &str) -> Result<(), String> {
397        let p = self.provider.read().await;
398        p.load_model(model_id).await
399    }
400
401    pub async fn load_model_with_context(
402        &self,
403        model_id: &str,
404        context_length: Option<usize>,
405    ) -> Result<(), String> {
406        let p = self.provider.read().await;
407        p.load_model_with_context(model_id, context_length).await
408    }
409
410    pub async fn load_embedding_model(&self, model_id: &str) -> Result<(), String> {
411        let p = self.provider.read().await;
412        p.load_embedding_model(model_id).await
413    }
414
415    pub async fn list_provider_models(
416        &self,
417        kind: crate::agent::provider::ProviderModelKind,
418        loaded_only: bool,
419    ) -> Result<Vec<String>, String> {
420        let p = self.provider.read().await;
421        p.list_models(kind, loaded_only).await
422    }
423
424    pub async fn unload_model(&self, model_id: Option<&str>, all: bool) -> Result<String, String> {
425        let p = self.provider.read().await;
426        p.unload_model(model_id, all).await
427    }
428
429    pub async fn unload_embedding_model(&self, model_id: Option<&str>) -> Result<String, String> {
430        let p = self.provider.read().await;
431        p.unload_embedding_model(model_id).await
432    }
433
434    pub async fn prewarm(&self) -> Result<(), String> {
435        let p = self.provider.read().await;
436        p.prewarm().await
437    }
438
439    pub async fn detect_context_length(&self) -> usize {
440        let p = self.provider.read().await;
441        p.detect_context_length().await
442    }
443
444    pub async fn set_runtime_profile(&self, model: &str, context_length: usize) {
445        if let Ok(mut guard) = self.cached_model.write() {
446            *guard = model.to_string();
447        }
448        self.cached_context
449            .store(context_length, std::sync::atomic::Ordering::SeqCst);
450
451        let mut p = self.provider.write().await;
452        p.set_runtime_profile(model, context_length);
453    }
454
455    pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
456        let previous_model = self.current_model();
457        let previous_context = self.current_context_length();
458
459        let detected_model = match self.get_loaded_model().await {
460            Some(m) if !m.is_empty() => m,
461            Some(_) => "no model loaded".to_string(),
462            None => previous_model.clone(),
463        };
464
465        let detected_context = self.detect_context_length().await;
466        let effective_model = if detected_model.is_empty() {
467            previous_model.clone()
468        } else {
469            detected_model
470        };
471        let effective_context = resolve_runtime_context(
472            &previous_model,
473            previous_context,
474            &effective_model,
475            detected_context,
476        );
477
478        let changed = effective_model != previous_model || effective_context != previous_context;
479        if changed {
480            self.set_runtime_profile(&effective_model, effective_context)
481                .await;
482        }
483
484        Some((effective_model, effective_context, changed))
485    }
486
487    pub fn build_system_prompt(
488        &self,
489        snark: u8,
490        chaos: u8,
491        brief: bool,
492        professional: bool,
493        tools: &[ToolDefinition],
494        reasoning_history: Option<&str>,
495        environment_summary: Option<&str>,
496        mcp_tools: &[crate::agent::mcp::McpTool],
497    ) -> String {
498        let mut sys = self.build_system_prompt_legacy(
499            snark,
500            chaos,
501            brief,
502            professional,
503            tools,
504            reasoning_history,
505            environment_summary,
506        );
507
508        if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
509            sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
510            sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
511            for tool in mcp_tools {
512                let description = tool
513                    .description
514                    .as_deref()
515                    .unwrap_or("No description provided.");
516                let _ = writeln!(sys, "- {}: {}", tool.name, description);
517            }
518        }
519
520        sys
521    }
522
523    pub fn build_system_prompt_legacy(
524        &self,
525        snark: u8,
526        _chaos: u8,
527        brief: bool,
528        professional: bool,
529        tools: &[ToolDefinition],
530        reasoning_history: Option<&str>,
531        environment_summary: Option<&str>,
532    ) -> String {
533        let current_context_length = self.current_context_length();
534        if is_tiny_context_window(current_context_length) {
535            return self.build_system_prompt_tiny(brief, professional);
536        }
537        if is_compact_context_window(current_context_length) {
538            return self.build_system_prompt_compact(brief, professional, tools);
539        }
540
541        // Hematite bootstrap: keep reasoning disciplined without leaking scaffolding into user-facing replies.
542        let mut sys = String::from("## HEMATITE OPERATING PROTOCOL\n\
543                                     - You are Hematite, a local coding system working on the user's machine.\n\
544                                     - The running Hematite build is ");
545        sys.push_str(&crate::hematite_version_display());
546        sys.push_str(".\n\
547                                     - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
548                                     - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
549                                     - For simple questions, answer briefly in plain language.\n\
550                                     - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
551                                     - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
552                                     - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
553                                     - Keep internal reasoning inside channel delimiters.\n\
554                                     - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\n");
555
556        if let Some(history) = reasoning_history {
557            if !history.is_empty() {
558                sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
559                sys.push_str(history);
560                sys.push_str("\n\n");
561            }
562        }
563
564        // ADAPTIVE THOUGHT EFFICIENCY (Gemma-4 Native)
565        if brief {
566            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
567                          - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
568                          - Depth: Surface-level verification only.\n\n");
569        } else {
570            sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
571                          - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
572                          - Depth: Full multi-step derivation required.\n\n");
573        }
574
575        // IDENTITY & ENVIRONMENT
576        let os = std::env::consts::OS;
577        if let Some(summary) = environment_summary {
578            sys.push_str("## HOST ENVIRONMENT\n");
579            sys.push_str(summary);
580            sys.push_str("\n\n");
581        }
582
583        if professional {
584            let _ = writeln!(
585                sys,
586                "You are Hematite, a local coding system running on {}. \
587                 The TUI is one interface layer, not your whole identity. \
588                 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
589                 Skip filler and keep the focus on the work.",
590                os
591            );
592        } else {
593            let _ = writeln!(sys,
594                "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
595                 The terminal UI is only one surface of the system. \
596                 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
597                 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.",
598                self.species, snark, os
599            );
600        }
601
602        // Inject loaded model and context window so the model knows its own budget.
603        let current_model = self.current_model();
604        if !current_model.is_empty() {
605            let _ = write!(
606                sys,
607                "Loaded model: {} | Context window: {} tokens. \
608                 Calibrate response length and tool-call depth to fit within this budget.\n\n",
609                current_model, current_context_length
610            );
611            if is_hematite_native_model(&current_model) {
612                sys.push_str(
613                    "Sovereign native note: prefer exact tool JSON with no extra prose when calling tools. \
614                     Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
615                     For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
616                );
617            }
618        } else {
619            let _ = write!(sys,
620                "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
621                current_context_length
622            );
623        }
624
625        // PROTOCOL & TOOLS
626        let shell_desc = if cfg!(target_os = "windows") {
627            "[EXTERNAL SHELL]: `powershell` (Windows).\n\
628             - Use ONLY for builds, tests, or file migrations. \n\
629             - You MUST use the `powershell` tool directly. \n\
630             - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
631        } else {
632            "[EXTERNAL SHELL]: `bash` (Unix).\n\
633             - Use ONLY for builds, tests, or file migrations. \n\
634             - NEVER wrap bash in other shells. \n\n"
635        };
636
637        sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
638                      [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
639                      - These are the ONLY way to explore and modify code. \n\
640                      - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
641        sys.push_str(shell_desc);
642
643        // ANTI-LOOPING & SELF-AUDIT
644        sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
645                      SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
646
647        sys.push_str("## THE COMPUTATIONAL RESEARCH MANDATE\n\
648                      - You are a Lead Computational Researcher and Senior Scientist.\n\
649                      - ZERO-TRUST MATH: You never guess results for math, physics, or algorithmic complexity.\n\
650                      - UNIT-SAFETY: All physical calculations must use `scientific_compute(mode='units')` to ensure dimensional consistency.\n\
651                      - SYMBOLIC PROOF: Use `scientific_compute(mode='symbolic')` for formal algebraic derivations and multi-variable proofs. Set `latex: true` for formal presentation.\n\
652                      - EMPIRICAL AUDITING: All algorithmic performance claims must be verified with `scientific_compute(mode='complexity')` before being finalized.\n\
653                      - SCIENTIFIC MEMORY (LEDGER): Use `scientific_compute(mode='ledger')` to persist long-form derivations, constants, and theorem steps to `.hematite/docs/scientific_ledger.md`. This ledger is RAG-indexed by The Vein, giving you persistent cross-session memory for project math.\n\
654                      - DATASET COMPUTATION: Use `scientific_compute(mode='dataset')` to perform high-precision calculations on SQL results (CSV/DB/JSON). This bridges data science and formal research.\n\
655                      - LIGHTWEIGHT SANDBOX: Prioritize pure Python implementations for all research tasks. Do NOT attempt to import heavy external libraries like 'numpy', 'scipy', or 'pandas' unless you have verified they are available or the user explicitly asks to work in a specific heavy environment or venv.\n\
656                      - Every result must be backed by the executable logic used to prove it.\n\n");
657
658        // Consolidated: All directives are now handled by the authoritative prompt.rs builder.
659        sys.push_str("## TURN ADVISORY\n");
660        if brief {
661            sys.push_str("- BRIEF MODE: Respond with ONE concise sentence/block unless more code is required.\n");
662        }
663        sys.push_str("- INTERNAL REASONING: Plan your move in the thought channel first.\n");
664
665        // Scaffolding protocol — enforces build validation after project creation.
666        sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
667            2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
668            3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
669            4. Fix all errors before declaring success.\n\n\
670            ## PRE-FLIGHT SCOPING PROTOCOL\n\
671            Before attempting any multi-file task or complex refactor:\n\
672            1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
673            2. Use `auto_pin_context` to keep those files in active context.\n\
674            3. Only then proceed to deeper edits or research.\n\n\
675            ## REFACTORING PROTOCOL\n\
676            When modifying existing code or renaming symbols:\n\
677            1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
678            2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
679            3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
680
681        // Inject CLAUDE.md / instruction files from the project directory.
682        sys.push_str(&load_instruction_files());
683        sys.push_str(&load_agent_skill_catalog());
684
685        // Inject cross-session memories synthesized by DeepReflect.
686        sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
687
688        // Native Gemma-4 Tool Declarations
689        if !tools.is_empty() {
690            sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
691            for tool in tools {
692                let schema = serde_json::to_string(&tool.function.parameters)
693                    .unwrap_or_else(|_| "{}".to_string());
694                let _ = writeln!(
695                    sys,
696                    "<|tool>declaration:{}{{{}<tool|>",
697                    tool.function.name, schema
698                );
699                let _ = writeln!(sys, "// {})", tool.function.description);
700            }
701        }
702
703        sys
704    }
705
706    fn build_system_prompt_compact(
707        &self,
708        brief: bool,
709        professional: bool,
710        tools: &[ToolDefinition],
711    ) -> String {
712        // Compact tier: fits in 16k context. Keeps tool names + one-line descriptions
713        // but skips full JSON schemas, verbose protocol sections, and CLAUDE.md injection.
714        let current_model = self.current_model();
715        let current_context_length = self.current_context_length();
716        let os = std::env::consts::OS;
717
718        let mut sys = format!(
719            "You are Hematite {}, a local coding harness working on the user's machine.\n",
720            crate::hematite_version_display()
721        );
722        if professional {
723            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
724        } else {
725            let _ = writeln!(
726                sys,
727                "You are a [{}] local AI coding system. Be direct, concise, and technical.",
728                self.species
729            );
730        }
731        let _ = writeln!(
732            sys,
733            "Model: {} | Context: {} tokens. Keep turns focused.",
734            current_model, current_context_length
735        );
736        if is_hematite_native_model(&current_model) {
737            sys.push_str(
738                "Sovereign native: use exact tool JSON. No extra prose in tool calls. \
739                 Raw regex patterns in grep_files, no slash delimiters.\n",
740            );
741        }
742        if cfg!(target_os = "windows") {
743            let _ = writeln!(
744                sys,
745                "OS: {}. Use PowerShell for shell. Never bash or /dev/null.",
746                os
747            );
748        } else {
749            let _ = writeln!(sys, "OS: {}. Use native Unix shell.", os);
750        }
751        if brief {
752            sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
753        }
754
755        sys.push_str(
756            "\nCORE RULES:\n\
757             - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
758             - Verify after edits: run `verify_build` after code changes, before committing.\n\
759             - One tool at a time. Do not batch unrelated tool calls.\n\
760             - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
761             - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
762             - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
763        );
764
765        if !tools.is_empty() {
766            sys.push_str("\n# AVAILABLE TOOLS\n");
767            for tool in tools {
768                let desc: String = tool.function.description.chars().take(120).collect();
769                let _ = writeln!(sys, "- {}: {}", tool.function.name, desc);
770            }
771        }
772
773        sys
774    }
775
776    fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
777        let current_model = self.current_model();
778        let current_context_length = self.current_context_length();
779        let os = std::env::consts::OS;
780        let mut sys = format!(
781            "You are Hematite {}, a local coding harness working on the user's machine.\n",
782            crate::hematite_version_display()
783        );
784        if professional {
785            sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
786        } else {
787            let _ = writeln!(
788                sys,
789                "You are a [{}] local AI coding system. Be direct, concise, and technical.",
790                self.species
791            );
792        }
793        if !current_model.is_empty() {
794            let _ = writeln!(
795                sys,
796                "Loaded model: {} | Context window: {} tokens.",
797                current_model, current_context_length
798            );
799        } else {
800            let _ = writeln!(sys, "Context window: {} tokens.", current_context_length);
801        }
802        sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
803        sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
804        sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
805        if cfg!(target_os = "windows") {
806            let _ = writeln!(sys,
807                "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.",
808                os
809            );
810        } else {
811            let _ = writeln!(
812                sys,
813                "You are running on {}. Use the native Unix shell conventions.",
814                os
815            );
816        }
817        if brief {
818            sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
819        }
820        sys
821    }
822
823    pub fn current_model(&self) -> String {
824        self.cached_model
825            .read()
826            .map(|g| g.clone())
827            .unwrap_or_default()
828    }
829
830    pub fn current_context_length(&self) -> usize {
831        self.cached_context
832            .load(std::sync::atomic::Ordering::Relaxed)
833    }
834
835    pub fn is_compact_context_window(&self) -> bool {
836        let len = self.current_context_length();
837        len <= 16384
838    }
839
840    pub fn gemma_native_formatting_enabled(&self) -> bool {
841        self.gemma_native_formatting
842            .load(std::sync::atomic::Ordering::Relaxed)
843    }
844
845    pub async fn call_with_tools(
846        &self,
847        messages: &[ChatMessage],
848        tools: &[ToolDefinition],
849        // Override the model ID for this call. None = use the live runtime model.
850        model_override: Option<&str>,
851    ) -> Result<
852        (
853            Option<String>,
854            Option<Vec<ToolCallResponse>>,
855            Option<TokenUsage>,
856            Option<String>,
857        ),
858        String,
859    > {
860        let _permit = self
861            .kv_semaphore
862            .acquire()
863            .await
864            .map_err(|e| e.to_string())?;
865
866        let (res, model_name, prepared_messages) = {
867            let p = self.provider.read().await;
868            let model_name = model_override.unwrap_or(&p.current_model()).to_string();
869            let prepared_messages = if should_use_native_formatting(self, &model_name) {
870                prepare_gemma_native_messages(messages)
871            } else {
872                messages.to_vec()
873            };
874            if let Err(detail) = preflight_chat_request(
875                &model_name,
876                &prepared_messages,
877                tools,
878                self.current_context_length(),
879            ) {
880                return Err(format_runtime_failure_message(&detail));
881            }
882            let res = p
883                .call_with_tools(&prepared_messages, tools, model_override)
884                .await
885                .map_err(|e| format_runtime_failure_message(&e))?;
886            (res, model_name, prepared_messages)
887        };
888
889        if let Ok(mut econ) = self.economics.lock() {
890            econ.input_tokens += res.usage.prompt_tokens;
891            econ.output_tokens += res.usage.completion_tokens;
892        }
893
894        let mut content = res.content;
895        let mut tool_calls = res.tool_calls;
896
897        // Post-processing: Gemma 4 / thinking block extraction
898        if let Some(text) = &content {
899            if should_use_native_formatting(self, &model_name) {
900                let native_calls = extract_native_tool_calls(text);
901                if !native_calls.is_empty() {
902                    let mut existing = tool_calls.unwrap_or_default();
903                    existing.extend(native_calls);
904                    tool_calls = Some(existing);
905
906                    let stripped = strip_native_tool_call_text(text);
907                    content = if stripped.trim().is_empty() {
908                        None
909                    } else {
910                        Some(stripped)
911                    };
912                }
913            }
914        }
915
916        // Normalization: Tool arguments
917        if should_use_native_formatting(self, &model_name) {
918            if let Some(calls) = tool_calls.as_mut() {
919                for call in calls.iter_mut() {
920                    normalize_tool_argument_value(
921                        &call.function.name,
922                        &mut call.function.arguments,
923                    );
924                }
925            }
926        }
927
928        if should_use_native_formatting(self, &model_name)
929            && content.is_none()
930            && tool_calls.is_none()
931            && !prepared_messages.is_empty()
932        {
933            return Err(format_runtime_failure_message(
934                "model returned an empty response after native-format message preparation",
935            ));
936        }
937
938        Ok((content, tool_calls, Some(res.usage), res.finish_reason))
939    }
940
941    // ── Streaming call (used for plain-text responses) ────────────────────────
942
943    /// Stream a conversation (no tools). Emits Token/Done/Error events.
944    pub async fn stream_messages(
945        &self,
946        messages: &[ChatMessage],
947        tx: mpsc::Sender<InferenceEvent>,
948    ) -> Result<(), Box<dyn std::error::Error>> {
949        let provider = self.provider.read().await;
950        provider.stream(messages, tx).await
951    }
952
953    /// Single-turn streaming (legacy helper used by startup sequence).
954    pub async fn stream_generation(
955        &self,
956        prompt: &str,
957        snark: u8,
958        chaos: u8,
959        brief: bool,
960        professional: bool,
961        tx: mpsc::Sender<InferenceEvent>,
962    ) -> Result<(), Box<dyn std::error::Error>> {
963        let system =
964            self.build_system_prompt(snark, chaos, brief, professional, &[], None, None, &[]);
965        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
966        self.stream_messages(&messages, tx).await
967    }
968
969    // ── Swarm worker helpers (non-streaming) ──────────────────────────────────
970
971    /// Runs a task using the `worker_model` if set, otherwise falls back to the main `model`.
972    pub async fn generate_task_worker(
973        &self,
974        prompt: &str,
975        professional: bool,
976    ) -> Result<String, String> {
977        let current_model = self.current_model();
978        let model = self
979            .worker_model
980            .as_deref()
981            .unwrap_or(current_model.as_str());
982        self.generate_task_with_model(prompt, 0.1, professional, model)
983            .await
984    }
985
986    pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
987        self.generate_task_with_temp(prompt, 0.1, professional)
988            .await
989    }
990
991    pub async fn generate_task_with_temp(
992        &self,
993        prompt: &str,
994        temp: f32,
995        professional: bool,
996    ) -> Result<String, String> {
997        let current_model = self.current_model();
998        self.generate_task_with_model(prompt, temp, professional, &current_model)
999            .await
1000    }
1001
1002    pub async fn generate_task_with_model(
1003        &self,
1004        prompt: &str,
1005        _temp: f32,
1006        professional: bool,
1007        model: &str,
1008    ) -> Result<String, String> {
1009        let _permit = self
1010            .kv_semaphore
1011            .acquire()
1012            .await
1013            .map_err(|e| e.to_string())?;
1014
1015        let system =
1016            self.build_system_prompt(self.snark, 50, false, professional, &[], None, None, &[]);
1017        let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1018        if let Err(detail) =
1019            preflight_chat_request(model, &messages, &[], self.current_context_length())
1020        {
1021            return Err(format_runtime_failure_message(&detail));
1022        }
1023
1024        let p = self.provider.read().await;
1025        let res = p
1026            .call_with_tools(&messages, &[], Some(model))
1027            .await
1028            .map_err(|e| format_runtime_failure_message(&e))?;
1029
1030        res.content
1031            .ok_or_else(|| "Empty response from model".to_string())
1032    }
1033
1034    // ── History management ────────────────────────────────────────────────────
1035
1036    /// Prune middle turns when context grows too large, keeping system + recent N.
1037    #[allow(dead_code)]
1038    pub fn snip_history(
1039        &self,
1040        turns: &[ChatMessage],
1041        max_tokens_estimate: usize,
1042        keep_recent: usize,
1043    ) -> Vec<ChatMessage> {
1044        let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
1045        if total_chars / 4 <= max_tokens_estimate {
1046            return turns.to_vec();
1047        }
1048        let keep = keep_recent.min(turns.len());
1049        let mut snipped = vec![turns[0].clone()];
1050        if turns.len() > keep + 1 {
1051            snipped.push(ChatMessage::system(&format!(
1052                "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
1053                turns.len() - keep - 1
1054            )));
1055            snipped.extend_from_slice(&turns[turns.len() - keep..]);
1056        } else {
1057            snipped = turns.to_vec();
1058        }
1059        snipped
1060    }
1061}
1062
1063fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
1064    serde_json::to_vec(value)
1065        .ok()
1066        .map_or(0, |bytes| bytes.len() / 4 + 1)
1067}
1068
1069const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
1070
1071pub fn estimate_message_tokens(message: &ChatMessage) -> usize {
1072    let content_tokens = match &message.content {
1073        MessageContent::Text(s) => s.len() / 4 + 1,
1074        MessageContent::Parts(parts) => parts
1075            .iter()
1076            .map(|part| match part {
1077                ContentPart::Text { text } => text.len() / 4 + 1,
1078                // Image payloads are transported as data URLs, but their base64
1079                // length should not be treated like plain text context pressure.
1080                ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
1081            })
1082            .sum(),
1083    };
1084    let tool_tokens: usize = message
1085        .tool_calls
1086        .iter()
1087        .flatten()
1088        .map(|call| (call.function.name.len() + call.function.arguments.to_string().len()) / 4 + 4)
1089        .sum();
1090    content_tokens + tool_tokens + 6
1091}
1092
1093pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
1094    messages.iter().map(estimate_message_tokens).sum()
1095}
1096
1097fn reserved_output_tokens(context_length: usize) -> usize {
1098    let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
1099    proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
1100}
1101
1102pub fn estimate_prompt_pressure(
1103    messages: &[ChatMessage],
1104    tools: &[ToolDefinition],
1105    context_length: usize,
1106) -> (usize, usize, usize, u8) {
1107    let estimated_input_tokens =
1108        estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
1109    let reserved_output = reserved_output_tokens(context_length);
1110    let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
1111    let percent = (estimated_total.saturating_mul(100))
1112        .checked_div(context_length)
1113        .unwrap_or(0)
1114        .min(100) as u8;
1115    (
1116        estimated_input_tokens,
1117        reserved_output,
1118        estimated_total,
1119        percent,
1120    )
1121}
1122
1123fn preflight_chat_request(
1124    model: &str,
1125    messages: &[ChatMessage],
1126    tools: &[ToolDefinition],
1127    context_length: usize,
1128) -> Result<(), String> {
1129    let (estimated_input_tokens, reserved_output, estimated_total, _) =
1130        estimate_prompt_pressure(messages, tools, context_length);
1131
1132    if estimated_total > context_length {
1133        return Err(format!(
1134            "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
1135            model, estimated_input_tokens, reserved_output, estimated_total, context_length
1136        ));
1137    }
1138
1139    Ok(())
1140}
1141
1142/// Walk from CWD up to 4 parent directories and collect project guidance files.
1143/// Looks for rule files plus optional skill guidance such as CLAUDE.md,
1144/// .hematite/rules.md, SKILLS.md, SKILL.md, and .hematite/instructions.md.
1145/// Deduplicates by content hash; truncates at 4KB per file, 12KB total.
1146/// Result is cached by CWD so repeated per-turn calls pay zero I/O after the first.
1147fn load_instruction_files() -> String {
1148    use std::collections::hash_map::DefaultHasher;
1149    use std::collections::HashSet;
1150    use std::hash::{Hash, Hasher};
1151
1152    let Ok(cwd) = std::env::current_dir() else {
1153        return String::new();
1154    };
1155
1156    // Fast path: cache keyed by CWD — instruction files are session-constant.
1157    static CACHE: std::sync::Mutex<Option<(String, String)>> = std::sync::Mutex::new(None);
1158    let cwd_key = cwd.to_string_lossy().into_owned();
1159    if let Ok(g) = CACHE.lock() {
1160        if let Some((ref k, ref v)) = *g {
1161            if *k == cwd_key {
1162                return v.clone();
1163            }
1164        }
1165    }
1166    let mut result = String::with_capacity(4096);
1167    let mut seen: HashSet<u64> = HashSet::new();
1168    let mut total_chars: usize = 0;
1169    const MAX_TOTAL: usize = 12_000;
1170    const MAX_PER_FILE: usize = 4_000;
1171
1172    let mut dir = cwd.clone();
1173    for _ in 0..4 {
1174        for name in crate::agent::instructions::PROJECT_GUIDANCE_FILES {
1175            let path = crate::agent::instructions::resolve_guidance_path(&dir, name);
1176            if !path.exists() {
1177                continue;
1178            }
1179            let Ok(content) = std::fs::read_to_string(&path) else {
1180                continue;
1181            };
1182            if content.trim().is_empty() {
1183                continue;
1184            }
1185
1186            let mut hasher = DefaultHasher::new();
1187            content.hash(&mut hasher);
1188            let h = hasher.finish();
1189            if !seen.insert(h) {
1190                continue;
1191            }
1192
1193            let truncated = if content.len() > MAX_PER_FILE {
1194                format!("{}...[truncated]", &content[..MAX_PER_FILE])
1195            } else {
1196                content
1197            };
1198
1199            if total_chars + truncated.len() > MAX_TOTAL {
1200                break;
1201            }
1202            total_chars += truncated.len();
1203            let _ = write!(result, "\n--- {} ---\n{}\n", path.display(), truncated);
1204        }
1205        match dir.parent().map(|p| p.to_owned()) {
1206            Some(p) => dir = p,
1207            None => break,
1208        }
1209    }
1210
1211    let output = if result.is_empty() {
1212        String::new()
1213    } else {
1214        format!("\n\n# Project Instructions And Skills\n{}", result)
1215    };
1216    if let Ok(mut g) = CACHE.lock() {
1217        *g = Some((cwd_key, output.clone()));
1218    }
1219    output
1220}
1221
1222fn load_agent_skill_catalog() -> String {
1223    static CACHE: std::sync::Mutex<Option<(String, String)>> = std::sync::Mutex::new(None);
1224    let workspace_root = crate::tools::file_ops::workspace_root();
1225    let cwd_key = workspace_root.to_string_lossy().into_owned();
1226    if let Ok(g) = CACHE.lock() {
1227        if let Some((ref k, ref v)) = *g {
1228            if *k == cwd_key {
1229                return v.clone();
1230            }
1231        }
1232    }
1233
1234    let config = crate::agent::config::load_config();
1235    let discovery =
1236        crate::agent::instructions::discover_agent_skills(&workspace_root, &config.trust);
1237    let output = crate::agent::instructions::render_skill_catalog(&discovery, 6_000)
1238        .map(|rendered| format!("\n\n{}", rendered))
1239        .unwrap_or_default();
1240    if let Ok(mut g) = CACHE.lock() {
1241        *g = Some((cwd_key, output.clone()));
1242    }
1243    output
1244}
1245
/// Return the trimmed text of the first Gemma-4 thought channel in `text`.
///
/// The block opens at `<|channel>thought` (matched ASCII-case-insensitively)
/// and ends at the next `<channel|>`, or at the end of `text` when the block
/// is never closed. Returns `None` when there is no opening tag or the block
/// is empty after trimming.
pub fn extract_think_block(text: &str) -> Option<String> {
    // Official Gemma-4 native tags.
    const OPEN_TAG: &str = "<|channel>thought";
    const CLOSE_TAG: &str = "<channel|>";

    // ASCII lowercasing never changes byte lengths, so offsets found in the
    // lowered copy are valid indices into `text` (full Unicode lowercasing
    // can change lengths and would misalign them).
    let haystack = text.to_ascii_lowercase();

    let body_start = haystack.find(OPEN_TAG)? + OPEN_TAG.len();
    let body_end = match haystack[body_start..].find(CLOSE_TAG) {
        Some(offset) => body_start + offset,
        None => text.len(),
    };

    let thought = text[body_start..body_end].trim();
    (!thought.is_empty()).then(|| thought.to_string())
}
1270
/// Remove model reasoning from `text` and return only the user-visible answer.
///
/// Checks run in this order and the first one that applies decides the result:
/// 1. A leading `</think>` (after leading whitespace) is dropped.
/// 2. If a `<channel|>` closing tag exists, the answer is everything after its
///    first occurrence, with any remaining channel tags removed.
/// 3. If any known opening reasoning tag exists, only the text BEFORE the
///    earliest one is kept (empty when the tag is at offset 0).
/// 4. If a known "naked reasoning" phrase occurs anywhere, leading lines that
///    are blank or contain such a phrase are skipped.
/// 5. Otherwise leaked tool-call / chat-template tags are stripped.
///
/// Every returned value is trimmed and has each `"\n\n\n"` replaced by
/// `"\n\n"` in a single pass.
pub fn strip_think_blocks(text: &str) -> String {
    // Fast-path: strip a stray </think> the model emits at the start when it skips
    // the opening tag (common with Qwen after tool calls). Strip it before the lower
    // allocation so it can't slip through any branch below.
    let text = {
        let t = text.trim_start();
        // `get(..8)` yields None for short input or a non-boundary index, so this never panics.
        if t.get(..8)
            .map(|s| s.eq_ignore_ascii_case("</think>"))
            .unwrap_or(false)
        {
            &t[8..]
        } else {
            // No stray tag: keep the original (untrimmed) text.
            text
        }
    };

    // ASCII lowercasing keeps byte offsets identical to `text`.
    let lower = text.to_ascii_lowercase();

    // Use the official Gemma-4 closing tag — answer is everything after it.
    if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
        // These replacements are case-sensitive, unlike the search above.
        let answer = text[end..]
            .replace("<|channel>thought", "")
            .replace("<channel|>", "");
        return answer.trim().replace("\n\n\n", "\n\n").to_string();
    }

    // No `<channel|>` closing tag — if an opening reasoning tag is present, keep only
    // the text that precedes the earliest one; the tag and everything after it are dropped.
    // NOTE(review): a closed `<think>…</think>` pair is not recognised here, so input that
    // starts with `<think>` yields an empty string — presumably callers handle that case
    // before calling; confirm.
    let first_open = [
        lower.find("<|channel>thought"), // Prioritize Gemma-4 native
        lower.find("<think>"),
        lower.find("<thinking>"),
        lower.find("<thought>"),
        lower.find("<|think|>"),
    ]
    .iter()
    .filter_map(|&x| x)
    .min();

    if let Some(start) = first_open {
        if start > 0 {
            return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
        }
        // Tag at the very beginning: nothing precedes it, so there is no answer.
        return String::new();
    }

    // If the model outputs 'naked' reasoning without tags:
    // Strip leading sentences like "The user asked..." or "I should present..."
    // if they appear before actual answer content.
    // The matcher is built once and reused; matching is ASCII-case-insensitive.
    static NAKED_AC: std::sync::OnceLock<aho_corasick::AhoCorasick> = std::sync::OnceLock::new();
    let naked_ac = NAKED_AC.get_or_init(|| {
        aho_corasick::AhoCorasick::builder()
            .ascii_case_insensitive(true)
            .build([
                "the user asked",
                "the user is asking",
                "the user wants",
                "i will structure",
                "i should provide",
                "i should give",
                "i should avoid",
                "i should note",
                "i should focus",
                "i should keep",
                "i should respond",
                "i should present",
                "i should display",
                "i should show",
                "i need to",
                "i can see from",
                "without being overly",
                "let me ",
                "necessary information in my identity",
                "was computed successfully",
                "computed successfully",
            ])
            .expect("valid patterns")
    });
    // The trigger is a phrase anywhere in the text, but only LEADING lines are removed.
    let is_naked_reasoning = naked_ac.find(text).is_some();
    if is_naked_reasoning {
        let lines: Vec<&str> = text.lines().collect();
        if !lines.is_empty() {
            // Skip leading lines that are themselves reasoning prose or blank.
            // Stop skipping at the first line that looks like real answer content.
            let mut start_idx = 0;
            for (i, line) in lines.iter().enumerate() {
                let is_reasoning_line = naked_ac.find(line).is_some() || line.trim().is_empty();
                if is_reasoning_line {
                    start_idx = i + 1;
                } else {
                    break;
                }
            }
            if start_idx < lines.len() {
                return lines[start_idx..]
                    .join("\n")
                    .trim()
                    .replace("\n\n\n", "\n\n")
                    .to_string();
            }
            // Entire response was reasoning prose — return empty.
            return String::new();
        }
    }

    // Strip leaked XML tool-call fragments that Qwen sometimes emits when it
    // abandons a tool call mid-generation (e.g. </parameter></function></tool_call>).
    let cleaned = strip_xml_tool_call_artifacts(text);
    cleaned.trim().replace("\n\n\n", "\n\n").to_string()
}
1380
1381/// Remove stray XML tool-call closing/opening tags that local models occasionally
1382/// leak into visible output when they start-then-abandon a tool call.
1383fn strip_xml_tool_call_artifacts(text: &str) -> String {
1384    use aho_corasick::AhoCorasick;
1385    use std::sync::OnceLock;
1386
1387    // Tags to remove (both open and close forms, case-insensitive).
1388    const XML_ARTIFACTS: &[&str] = &[
1389        "</tool_call>",
1390        "<tool_call>",
1391        "</function>",
1392        "<function>",
1393        "</parameter>",
1394        "<parameter>",
1395        "</arguments>",
1396        "<arguments>",
1397        "</tool_use>",
1398        "<tool_use>",
1399        "</invoke>",
1400        "<invoke>",
1401        // Stray think/reasoning closing tags that leak after block extraction.
1402        "</think>",
1403        "<thinking>",
1404        "</thought>",
1405        "</thinking>",
1406        // Gemma-style turn markers that Qwen occasionally mirrors back from the system prompt.
1407        "<|turn>system",
1408        "<|turn>user",
1409        "<|turn>assistant",
1410        "<|turn>tool",
1411        "<turn|>",
1412        "<|think|>",
1413        // ChatML EOS/BOS tokens that can leak at end-of-generation.
1414        "<|im_start|>",
1415        "<|im_end|>",
1416        "<|endoftext|>",
1417    ];
1418
1419    // Build AC automaton once from pre-lowercased patterns; zero-cost on every
1420    // subsequent call.  All patterns are ASCII so byte positions are stable after
1421    // lowercasing (no multi-byte expansion).
1422    static ARTIFACT_AC: OnceLock<AhoCorasick> = OnceLock::new();
1423    let ac = ARTIFACT_AC.get_or_init(|| {
1424        let lowered: Vec<String> = XML_ARTIFACTS.iter().map(|s| s.to_lowercase()).collect();
1425        AhoCorasick::new(&lowered).expect("valid XML artifact patterns")
1426    });
1427
1428    let lower = text.to_ascii_lowercase();
1429
1430    // Fast path: nothing to strip (common case for clean model output).
1431    if ac.find(&lower).is_none() {
1432        return text.to_string();
1433    }
1434
1435    // Collect all match spans in a single left-to-right AC scan, then drain
1436    // in reverse so earlier byte offsets stay valid as we shorten the string.
1437    let spans: Vec<(usize, usize)> = ac.find_iter(&lower).map(|m| (m.start(), m.end())).collect();
1438    let mut out = text.to_string();
1439    for (start, end) in spans.into_iter().rev() {
1440        out.drain(start..end);
1441    }
1442    out
1443}
1444
1445// ── Cached regex accessors for tool-call parsing ─────────────────────────────
1446// Each regex is compiled once via OnceLock; subsequent calls are zero-cost.
1447
1448fn re_gemma_call() -> &'static regex::Regex {
1449    use std::sync::OnceLock;
1450    static RE: OnceLock<regex::Regex> = OnceLock::new();
1451    RE.get_or_init(|| {
1452        regex::Regex::new(r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#)
1453            .expect("valid gemma call regex")
1454    })
1455}
1456fn re_gemma_arg() -> &'static regex::Regex {
1457    use std::sync::OnceLock;
1458    static RE: OnceLock<regex::Regex> = OnceLock::new();
1459    RE.get_or_init(|| {
1460        regex::Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#)
1461            .expect("valid gemma arg regex")
1462    })
1463}
1464fn re_xml_call() -> &'static regex::Regex {
1465    use std::sync::OnceLock;
1466    static RE: OnceLock<regex::Regex> = OnceLock::new();
1467    RE.get_or_init(|| {
1468        regex::Regex::new(r#"(?s)<tool_call>\s*<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>)?\s*</tool_call>"#)
1469            .expect("valid xml call regex")
1470    })
1471}
1472fn re_xml_param() -> &'static regex::Regex {
1473    use std::sync::OnceLock;
1474    static RE: OnceLock<regex::Regex> = OnceLock::new();
1475    RE.get_or_init(|| {
1476        regex::Regex::new(r#"(?s)<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)</parameter>"#)
1477            .expect("valid xml param regex")
1478    })
1479}
1480fn re_short_call() -> &'static regex::Regex {
1481    use std::sync::OnceLock;
1482    static RE: OnceLock<regex::Regex> = OnceLock::new();
1483    RE.get_or_init(|| {
1484        regex::Regex::new(r#"(?s)<tool_call>\s*([A-Za-z_][A-Za-z0-9_]*)\((.*?)\)\s*</tool_call>"#)
1485            .expect("valid short call regex")
1486    })
1487}
1488fn re_short_arg() -> &'static regex::Regex {
1489    use std::sync::OnceLock;
1490    static RE: OnceLock<regex::Regex> = OnceLock::new();
1491    RE.get_or_init(|| {
1492        regex::Regex::new(
1493            r#"([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(?:"((?:\\.|[^"])*)"|'((?:\\.|[^'])*)'|([^,\)]+))"#,
1494        )
1495        .expect("valid short arg regex")
1496    })
1497}
1498fn re_strip_gemma_call() -> &'static regex::Regex {
1499    use std::sync::OnceLock;
1500    static RE: OnceLock<regex::Regex> = OnceLock::new();
1501    RE.get_or_init(|| {
1502        regex::Regex::new(r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#)
1503            .expect("valid strip gemma call regex")
1504    })
1505}
1506fn re_strip_xml() -> &'static regex::Regex {
1507    use std::sync::OnceLock;
1508    static RE: OnceLock<regex::Regex> = OnceLock::new();
1509    RE.get_or_init(|| {
1510        regex::Regex::new(r#"(?s)<tool_call>\s*<function=.*?>.*?</tool_call>"#)
1511            .expect("valid strip xml regex")
1512    })
1513}
1514fn re_strip_short() -> &'static regex::Regex {
1515    use std::sync::OnceLock;
1516    static RE: OnceLock<regex::Regex> = OnceLock::new();
1517    RE.get_or_init(|| {
1518        regex::Regex::new(r#"(?s)<tool_call>\s*[A-Za-z_][A-Za-z0-9_]*\(.*?\)\s*</tool_call>"#)
1519            .expect("valid strip short regex")
1520    })
1521}
1522fn re_strip_response() -> &'static regex::Regex {
1523    use std::sync::OnceLock;
1524    static RE: OnceLock<regex::Regex> = OnceLock::new();
1525    RE.get_or_init(|| {
1526        regex::Regex::new(
1527            r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#,
1528        )
1529        .expect("valid strip response regex")
1530    })
1531}
1532
1533/// Extract native Gemma-4 <|tool_call|> tags from text.
1534/// Format: <|tool_call|>call:func_name{key:<|"|>value<|"|>, key2:value2}<tool_call|>
1535pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
1536    let mut results = Vec::new();
1537
1538    // -- Format 1: Gemma 4 Native (call:name{args}) --
1539    let re_call = re_gemma_call();
1540    let re_arg = re_gemma_arg();
1541
1542    for cap in re_call.captures_iter(text) {
1543        let name = cap[1].to_string();
1544        let args_str = &cap[2];
1545        let mut arguments = serde_json::Map::new();
1546
1547        for arg_cap in re_arg.captures_iter(args_str) {
1548            let key = arg_cap[1].to_string();
1549            let val_raw = arg_cap
1550                .get(2)
1551                .map(|m| m.as_str())
1552                .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
1553                .unwrap_or("")
1554                .trim();
1555            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
1556
1557            let val = if normalized_raw == "true" {
1558                Value::Bool(true)
1559            } else if normalized_raw == "false" {
1560                Value::Bool(false)
1561            } else if let Ok(n) = normalized_raw.parse::<i64>() {
1562                Value::Number(n.into())
1563            } else if let Ok(n) = normalized_raw.parse::<u64>() {
1564                Value::Number(n.into())
1565            } else if let Ok(n) = normalized_raw.parse::<f64>() {
1566                serde_json::Number::from_f64(n)
1567                    .map(Value::Number)
1568                    .unwrap_or(Value::String(normalized_raw.clone()))
1569            } else {
1570                Value::String(normalized_raw)
1571            };
1572
1573            arguments.insert(key, val);
1574        }
1575
1576        results.push(ToolCallResponse {
1577            id: format!("call_{}", rand::random::<u32>()),
1578            call_type: "function".to_string(),
1579            function: ToolCallFn {
1580                name,
1581                arguments: Value::Object(arguments),
1582            },
1583            index: None,
1584        });
1585    }
1586
1587    // -- Format 2: XML (Qwen/Claude style) --
1588    for cap in re_xml_call().captures_iter(text) {
1589        let name = cap[1].to_string();
1590        let body = &cap[2];
1591        let mut arguments = serde_json::Map::new();
1592
1593        for p_cap in re_xml_param().captures_iter(body) {
1594            let key = p_cap[1].to_string();
1595            let val_raw = p_cap[2].trim();
1596            let val = if val_raw == "true" {
1597                Value::Bool(true)
1598            } else if val_raw == "false" {
1599                Value::Bool(false)
1600            } else if let Ok(n) = val_raw.parse::<i64>() {
1601                Value::Number(n.into())
1602            } else if let Ok(n) = val_raw.parse::<u64>() {
1603                Value::Number(n.into())
1604            } else {
1605                Value::String(val_raw.to_string())
1606            };
1607            arguments.insert(key, val);
1608        }
1609
1610        results.push(ToolCallResponse {
1611            id: format!("call_{}", rand::random::<u32>()),
1612            call_type: "function".to_string(),
1613            function: ToolCallFn {
1614                name,
1615                arguments: Value::Object(arguments),
1616            },
1617            index: None,
1618        });
1619    }
1620
1621    // -- Format 3: shorthand XML wrapper (<tool_call>name(key="value")</tool_call>) --
1622    for cap in re_short_call().captures_iter(text) {
1623        let name = cap[1].to_string();
1624        let args_str = cap[2].trim();
1625        let mut arguments = serde_json::Map::new();
1626
1627        for arg_cap in re_short_arg().captures_iter(args_str) {
1628            let key = arg_cap[1].to_string();
1629            let val_raw = arg_cap
1630                .get(2)
1631                .or_else(|| arg_cap.get(3))
1632                .or_else(|| arg_cap.get(4))
1633                .map(|m| m.as_str())
1634                .unwrap_or("")
1635                .trim();
1636            let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
1637
1638            let val = if normalized_raw == "true" {
1639                Value::Bool(true)
1640            } else if normalized_raw == "false" {
1641                Value::Bool(false)
1642            } else if let Ok(n) = normalized_raw.parse::<i64>() {
1643                Value::Number(n.into())
1644            } else if let Ok(n) = normalized_raw.parse::<u64>() {
1645                Value::Number(n.into())
1646            } else if let Ok(n) = normalized_raw.parse::<f64>() {
1647                serde_json::Number::from_f64(n)
1648                    .map(Value::Number)
1649                    .unwrap_or(Value::String(normalized_raw.clone()))
1650            } else {
1651                Value::String(normalized_raw)
1652            };
1653
1654            arguments.insert(key, val);
1655        }
1656
1657        results.push(ToolCallResponse {
1658            id: format!("call_{}", rand::random::<u32>()),
1659            call_type: "function".to_string(),
1660            function: ToolCallFn {
1661                name,
1662                arguments: Value::Object(arguments),
1663            },
1664            index: None,
1665        });
1666    }
1667
1668    results
1669}
1670
1671pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
1672    let trimmed = raw.trim();
1673    let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
1674
1675    let mut value = match serde_json::from_str::<Value>(&candidate) {
1676        Ok(v) => v,
1677        Err(_) => return candidate,
1678    };
1679    normalize_tool_argument_value(tool_name, &mut value);
1680    value.to_string()
1681}
1682
1683pub fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
1684    match value {
1685        Value::String(s) => *s = normalize_string_arg(s),
1686        Value::Array(items) => {
1687            for item in items {
1688                normalize_tool_argument_value(tool_name, item);
1689            }
1690        }
1691        Value::Object(map) => {
1692            for val in map.values_mut() {
1693                normalize_tool_argument_value(tool_name, val);
1694            }
1695            if tool_name == "grep_files" {
1696                if let Some(Value::String(pattern)) = map.get_mut("pattern") {
1697                    *pattern = normalize_regex_pattern(pattern);
1698                }
1699            }
1700            for key in ["path", "extension", "query", "command", "reason"] {
1701                if let Some(Value::String(s)) = map.get_mut(key) {
1702                    *s = normalize_string_arg(s);
1703                }
1704            }
1705        }
1706        _ => {}
1707    }
1708}
1709
/// Remove exactly one layer of matching wrapper quotes from `input`.
///
/// Returns `None` when `input` is shorter than two bytes or is not wrapped in
/// a matching pair of `"`, `'` or `` ` ``. Otherwise the inner text has `\"`
/// replaced by `"`, then `\\` replaced by `\`, and is trimmed. This is a
/// lightweight unescape, not a full JSON string decoder.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    if input.len() < 2 {
        return None;
    }

    let mut chars = input.chars();
    let (opening, closing) = (chars.next()?, chars.next_back()?);
    if opening != closing || !matches!(opening, '"' | '\'' | '`') {
        return None;
    }

    let inner = input.strip_prefix(opening)?.strip_suffix(closing)?;
    let unescaped = inner.replace("\\\"", "\"").replace("\\\\", "\\");
    Some(unescaped.trim().to_string())
}
1723
/// Trim `input` and peel off every layer of matching wrapper quotes.
///
/// A layer is removed only when the text both starts and ends with the same
/// quote character (`"`, `'` or `` ` ``); the remainder is trimmed again before
/// the next layer is checked. Mismatched or lone quotes are left in place.
fn normalize_string_arg(input: &str) -> String {
    /// Returns the text between one pair of matching wrapper quotes, if present.
    fn peel_quote_layer(text: &str) -> Option<&str> {
        for quote in ['"', '\'', '`'] {
            let inner = text
                .strip_prefix(quote)
                .and_then(|rest| rest.strip_suffix(quote));
            if inner.is_some() {
                return inner;
            }
        }
        None
    }

    let mut current = input.trim();
    while let Some(inner) = peel_quote_layer(current) {
        current = inner.trim();
    }
    current.to_string()
}
1744
1745fn normalize_regex_pattern(input: &str) -> String {
1746    let out = normalize_string_arg(input);
1747    if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
1748        out[1..out.len() - 1].to_string()
1749    } else {
1750        out
1751    }
1752}
1753
1754fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
1755    let mut system_blocks = Vec::with_capacity(2);
1756    let mut prepared = Vec::with_capacity(messages.len());
1757    let mut seeded = false;
1758
1759    for message in messages {
1760        if message.role == "system" {
1761            let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
1762                .trim()
1763                .to_string();
1764            if !cleaned.is_empty() {
1765                system_blocks.push(cleaned);
1766            }
1767            continue;
1768        }
1769
1770        let mut clone = message.clone();
1771        clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
1772
1773        if !seeded && message.role == "user" {
1774            let content_str = clone.content.as_str();
1775            let mut merged = String::with_capacity(
1776                system_blocks.iter().map(|s| s.len()).sum::<usize>()
1777                    + system_blocks.len().saturating_sub(1) * 2
1778                    + content_str.len()
1779                    + 40,
1780            );
1781            if !system_blocks.is_empty() {
1782                merged.push_str("System instructions for this turn:\n");
1783                merged.push_str(&system_blocks.join("\n\n"));
1784                merged.push_str("\n\n");
1785            }
1786            merged.push_str(content_str);
1787            clone.content = MessageContent::Text(merged);
1788            seeded = true;
1789        }
1790
1791        prepared.push(clone);
1792    }
1793
1794    if !seeded && !system_blocks.is_empty() {
1795        prepared.insert(
1796            0,
1797            ChatMessage::user(&format!(
1798                "System instructions for this turn:\n{}",
1799                system_blocks.join("\n\n")
1800            )),
1801        );
1802    }
1803
1804    prepared
1805}
1806
1807fn strip_legacy_turn_wrappers(text: &str) -> String {
1808    static AC: std::sync::OnceLock<aho_corasick::AhoCorasick> = std::sync::OnceLock::new();
1809    let ac = AC.get_or_init(|| {
1810        aho_corasick::AhoCorasick::new([
1811            "<|turn>system\n",
1812            "<|turn>user\n",
1813            "<|turn>assistant\n",
1814            "<|turn>tool\n",
1815            "<turn|>",
1816        ])
1817        .expect("valid turn wrapper patterns")
1818    });
1819    ac.replace_all(text, &["", "", "", "", ""])
1820        .trim()
1821        .to_string()
1822}
1823
1824pub fn strip_native_tool_call_text(text: &str) -> String {
1825    let without_calls = re_strip_gemma_call().replace_all(text, "");
1826    let without_xml = re_strip_xml().replace_all(without_calls.as_ref(), "");
1827    let without_short = re_strip_short().replace_all(without_xml.as_ref(), "");
1828    re_strip_response()
1829        .replace_all(without_short.as_ref(), "")
1830        .trim()
1831        .to_string()
1832}
1833
/// Decide which context-window size to report for the effective model.
///
/// * `0` when no model is loaded (`effective_model` is blank or the literal
///   `"no model loaded"`).
/// * `detected_context` when it is non-zero.
/// * `previous_context` when detection returned `0` but the model name is
///   unchanged from `previous_model`.
/// * `0` otherwise, so a stale size is never carried over to a different model.
fn resolve_runtime_context(
    previous_model: &str,
    previous_context: usize,
    effective_model: &str,
    detected_context: usize,
) -> usize {
    let nothing_loaded =
        effective_model == "no model loaded" || effective_model.trim().is_empty();
    if nothing_loaded {
        return 0;
    }
    if detected_context > 0 {
        return detected_context;
    }
    if effective_model == previous_model {
        previous_context
    } else {
        0
    }
}
1850
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Switches the process working directory and puts it back when dropped.
    ///
    /// Restoring in `Drop` means the directory is reset even if the code under
    /// test panics, so later tests are not left running inside a temp
    /// directory that is about to be deleted.
    struct CwdGuard {
        restore_to: std::path::PathBuf,
    }

    impl CwdGuard {
        /// Remember the current working directory, then change into `dir`.
        fn enter(dir: &std::path::Path) -> Self {
            // Prefer the live cwd. Fall back to the crate root if the cwd is
            // unreadable, e.g. because it was deleted underneath the process.
            let restore_to = std::env::current_dir()
                .ok()
                .filter(|path| path.is_dir())
                .unwrap_or_else(|| std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")));
            std::env::set_current_dir(dir).expect("enter temporary workspace");
            Self { restore_to }
        }
    }

    impl Drop for CwdGuard {
        fn drop(&mut self) {
            let outcome = std::env::set_current_dir(&self.restore_to);
            // A second panic while already unwinding would abort the whole
            // test binary, so only surface a restore failure on the normal path.
            if !std::thread::panicking() {
                outcome.expect("restore working directory");
            }
        }
    }

    /// The generated system prompt embeds the running Hematite version.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let system = engine.build_system_prompt(0, 50, false, true, &[], None, None, &[]);
        assert!(system.contains(crate::HEMATITE_VERSION));
    }

    /// A Gemma call whose opening and closing tags use different pipe
    /// placements is still extracted and stripped.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        let text = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "read_file");

        let args: Value = calls[0].function.arguments.clone();
        assert_eq!(args.get("limit").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(args.get("offset").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(
            args.get("path").and_then(|v| v.as_str()),
            Some("src/ui/tui.rs")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<tool_call|>"));
    }

    /// Tool responses that the model invented between calls are removed, while
    /// both real calls are still extracted.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let text = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "list_files");
        assert_eq!(calls[1].function.name, "read_file");

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<|tool_response"));
        assert!(!stripped.contains("<tool_response|>"));
    }

    /// `create_directory` is classified as a workspace-mutating tool.
    #[test]
    fn create_directory_is_treated_as_mutating_repo_write() {
        let metadata = tool_metadata_for_name("create_directory");
        assert!(metadata.mutates_workspace);
        assert!(!metadata.read_only_friendly);
    }

    /// XML-style calls embedded in reasoning text are extracted with their
    /// multi-line parameter bodies trimmed.
    #[test]
    fn extracts_qwen_xml_tool_calls_from_reasoning() {
        let text = r#"Based on the project structure, I need to check the binary.
<tool_call>
<function=shell>
<parameter=command>
ls -la hematite.exe
</parameter>
<parameter=reason>
Check if the binary exists
</parameter>
</function>
</tool_call>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "shell");

        let args: Value = calls[0].function.arguments.clone();
        assert_eq!(
            args.get("command").and_then(|v| v.as_str()),
            Some("ls -la hematite.exe")
        );
        assert_eq!(
            args.get("reason").and_then(|v| v.as_str()),
            Some("Check if the binary exists")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("<function=shell>"));
    }

    /// Shorthand `name(key="value")` calls are extracted and stripped.
    #[test]
    fn extracts_shorthand_tool_calls_from_reasoning() {
        let text = r#"<thinking>
The user wants a search first.
</thinking>

I'll search before continuing.

<tool_call>research_web(query="uefn toolbelt python automation unreal engine fortnite")</tool_call>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "research_web");

        let args: Value = calls[0].function.arguments.clone();
        assert_eq!(
            args.get("query").and_then(|v| v.as_str()),
            Some("uefn toolbelt python automation unreal engine fortnite")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("research_web(query="));
    }

    #[test]
    fn strips_thinking_tag_as_reasoning_prefix() {
        let cleaned =
            strip_think_blocks("<thinking>\nThe user wants a search.\n</thinking>\nVisible answer");
        assert_eq!(cleaned, "");
    }

    #[test]
    fn resolve_runtime_context_returns_zero_when_no_model_loaded() {
        assert_eq!(
            resolve_runtime_context("qwen/qwen3.5-9b", 32000, "no model loaded", 0),
            0
        );
    }

    #[test]
    fn resolve_runtime_context_preserves_previous_only_for_same_model() {
        assert_eq!(
            resolve_runtime_context("qwen/qwen3.5-9b", 32000, "qwen/qwen3.5-9b", 0),
            32000
        );
        assert_eq!(
            resolve_runtime_context("qwen/qwen3.5-9b", 32000, "bonsai-8b", 0),
            0
        );
    }

    /// A `SKILLS.md` in the working directory is picked up by the instruction
    /// loader.
    #[test]
    fn load_instruction_files_includes_workspace_guidance_files() {
        // Serialize with every other test that touches the process-wide cwd.
        let _cwd_lock = crate::TEST_CWD_LOCK
            .lock()
            .unwrap_or_else(|e| e.into_inner());
        let temp = tempfile::tempdir().unwrap();

        fs::write(
            temp.path().join("SKILLS.md"),
            "# Workspace Skills\n- Prefer API-first changes before UI polish.",
        )
        .unwrap();

        let loaded = {
            let _cwd = CwdGuard::enter(temp.path());
            load_instruction_files()
        };

        assert!(loaded.contains("SKILLS.md"));
        assert!(loaded.contains("Prefer API-first changes before UI polish."));
    }

    /// A skill directory under `.agents/skills` shows up in the skill catalog.
    #[test]
    fn load_agent_skill_catalog_includes_skill_directory_entries() {
        // Serialize with every other test that touches the process-wide cwd.
        let _cwd_lock = crate::TEST_CWD_LOCK
            .lock()
            .unwrap_or_else(|e| e.into_inner());
        let temp = tempfile::tempdir().unwrap();

        fs::create_dir_all(temp.path().join(".agents/skills/code-review")).unwrap();
        fs::write(
            temp.path().join(".agents/skills/code-review/SKILL.md"),
            "---\nname: code-review\ndescription: Review diffs and flag regressions.\ncompatibility: Requires git\n---\n",
        )
        .unwrap();

        let loaded = {
            let _cwd = CwdGuard::enter(temp.path());
            load_agent_skill_catalog()
        };

        assert!(loaded.contains("Agent Skills Catalog"));
        assert!(loaded.contains("code-review"));
        assert!(loaded.contains("Review diffs and flag regressions."));
    }
}
hematite/agent/inference.rs

hematite/agent/
inference.rs