1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
/// Handle for talking to the local model provider.
///
/// Constructed by `InferenceEngine::new`. The runtime-adjustable pieces
/// (model id, context length, flags) sit behind a lock or atomics so they can
/// be changed through a shared `&self`.
pub struct InferenceEngine {
    /// HTTP client used for every provider request.
    pub client: reqwest::Client,
    /// Chat endpoint URL; `new` makes sure it ends with `/chat/completions`.
    pub api_url: String,
    /// Scheme and host part of the configured URL. The model-listing
    /// endpoints (`/v1/models`, `/api/v0/models`) are built from it.
    pub base_url: String,
    /// Persona species name; interpolated into the non-professional system prompt.
    pub species: String,
    /// Snark level supplied at construction time.
    pub snark: u8,
    /// Semaphore created with 3 permits in `new`.
    /// NOTE(review): presumably bounds concurrent inference requests — the
    /// acquiring code is not visible here; confirm.
    pub kv_semaphore: Semaphore,
    /// Identifier of the model currently in use; starts out empty.
    pub model: std::sync::RwLock<String>,
    /// Context window size in tokens; `new` initialises it to 32_768.
    pub context_length: std::sync::atomic::AtomicUsize,
    /// Shared session economics tracker.
    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
    /// Optional secondary model id; `new` leaves it as `None`.
    /// NOTE(review): presumably used for delegated worker calls — confirm.
    pub worker_model: Option<String>,
    /// Flag toggled by `set_gemma_native_formatting`; starts as `false`.
    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
    /// Shared flag initialised to `false`.
    /// NOTE(review): presumably polled to abort in-flight work — the readers
    /// are not visible here; confirm.
    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
}
30
/// Reports whether `model` names a Gemma 4 variant.
///
/// The comparison is ASCII-case-insensitive and accepts both the hyphenated
/// (`gemma-4`) and the compact (`gemma4`) spelling anywhere inside the name.
pub fn is_gemma4_model_name(model: &str) -> bool {
    const MARKERS: [&str; 2] = ["gemma-4", "gemma4"];
    let normalized = model.to_ascii_lowercase();
    MARKERS.iter().any(|marker| normalized.contains(*marker))
}
35
36fn should_use_gemma_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37 is_gemma4_model_name(model) && engine.gemma_native_formatting_enabled()
38}
39
/// A tool exposed to the model, plus local-only classification metadata.
#[derive(Serialize, Clone, Debug)]
pub struct ToolDefinition {
    /// Serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub tool_type: String,
    /// Name, description and parameter schema of the callable function.
    pub function: ToolFunction,
    /// Local classification of the tool; excluded from (de)serialization, so
    /// it is never part of the serialized payload.
    #[serde(skip_serializing, skip_deserializing)]
    pub metadata: ToolMetadata,
}
50
/// Function description carried inside a [`ToolDefinition`].
#[derive(Serialize, Clone, Debug)]
pub struct ToolFunction {
    /// Tool name the model uses to call it.
    pub name: String,
    /// Human-readable description of what the tool does.
    pub description: String,
    /// Arbitrary JSON value describing the parameters.
    /// NOTE(review): presumably a JSON Schema object — confirm against the tool builders.
    pub parameters: Value,
}
57
/// Coarse grouping assigned to each tool by `tool_metadata_for_name`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ToolCategory {
    /// File reading / searching tools (`read_file`, `grep_files`, ...).
    RepoRead,
    /// File writing / editing tools (`write_file`, `edit_file`, ...).
    RepoWrite,
    /// `shell`, `inspect_host` and `resolve_host_issue`.
    Runtime,
    /// `trace_runtime_flow`.
    Architecture,
    /// `describe_toolchain`.
    Toolchain,
    /// `verify_build`.
    Verification,
    /// The `git_*` tools.
    Git,
    /// `research_web` and `fetch_docs`.
    Research,
    /// `vision_analyze`.
    Vision,
    /// The `lsp_*` tools.
    Lsp,
    /// Workflow runners and session helpers (`run_workspace_workflow`,
    /// `auto_pin_context`, `manage_tasks`, ...).
    Workflow,
    /// Any tool whose name starts with `mcp__`.
    External,
    /// Fallback for names that match nothing else.
    Other,
}
74
/// Classification flags attached to a tool by `tool_metadata_for_name`.
///
/// NOTE(review): how each flag is enforced is decided by code outside this
/// chunk; the field notes below describe only what the names and the
/// assignments in `tool_metadata_for_name` show.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ToolMetadata {
    /// Coarse grouping of the tool.
    pub category: ToolCategory,
    /// Set for tools classified as changing the workspace.
    pub mutates_workspace: bool,
    /// Set for MCP tools and `resolve_host_issue`.
    pub external_surface: bool,
    /// Set for mutating tools and every MCP tool.
    pub trust_sensitive: bool,
    /// Set for tools classified as non-mutating.
    pub read_only_friendly: bool,
    /// Set for the repo read/write tools and the pin/clarify helpers.
    /// NOTE(review): presumably "allowed while a plan is scoped" — confirm.
    pub plan_scope: bool,
}
84
85pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
86 if name.starts_with("mcp__") {
87 let lower = name.to_ascii_lowercase();
88 let mutates_workspace = [
89 "__edit",
90 "__write",
91 "__create",
92 "__move",
93 "__delete",
94 "__remove",
95 "__rename",
96 "__replace",
97 "__patch",
98 ]
99 .iter()
100 .any(|needle| lower.contains(needle));
101 return ToolMetadata {
102 category: ToolCategory::External,
103 mutates_workspace,
104 external_surface: true,
105 trust_sensitive: true,
106 read_only_friendly: !mutates_workspace,
107 plan_scope: false,
108 };
109 }
110
111 match name {
112 "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
113 category: ToolCategory::RepoRead,
114 mutates_workspace: false,
115 external_surface: false,
116 trust_sensitive: false,
117 read_only_friendly: true,
118 plan_scope: true,
119 },
120 "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => ToolMetadata {
121 category: ToolCategory::RepoWrite,
122 mutates_workspace: true,
123 external_surface: false,
124 trust_sensitive: true,
125 read_only_friendly: false,
126 plan_scope: true,
127 },
128 "trace_runtime_flow" => ToolMetadata {
129 category: ToolCategory::Architecture,
130 mutates_workspace: false,
131 external_surface: false,
132 trust_sensitive: false,
133 read_only_friendly: true,
134 plan_scope: false,
135 },
136 "describe_toolchain" => ToolMetadata {
137 category: ToolCategory::Toolchain,
138 mutates_workspace: false,
139 external_surface: false,
140 trust_sensitive: false,
141 read_only_friendly: true,
142 plan_scope: false,
143 },
144 "shell" => ToolMetadata {
145 category: ToolCategory::Runtime,
146 mutates_workspace: true,
147 external_surface: false,
148 trust_sensitive: true,
149 read_only_friendly: false,
150 plan_scope: false,
151 },
152 "inspect_host" => ToolMetadata {
153 category: ToolCategory::Runtime,
154 mutates_workspace: false,
155 external_surface: false,
156 trust_sensitive: false,
157 read_only_friendly: true,
158 plan_scope: false,
159 },
160 "resolve_host_issue" => ToolMetadata {
161 category: ToolCategory::Runtime,
162 mutates_workspace: true,
163 external_surface: true,
164 trust_sensitive: true,
165 read_only_friendly: false,
166 plan_scope: false,
167 },
168 "run_hematite_maintainer_workflow" => ToolMetadata {
169 category: ToolCategory::Workflow,
170 mutates_workspace: true,
171 external_surface: false,
172 trust_sensitive: true,
173 read_only_friendly: false,
174 plan_scope: false,
175 },
176 "run_workspace_workflow" => ToolMetadata {
177 category: ToolCategory::Workflow,
178 mutates_workspace: true,
179 external_surface: false,
180 trust_sensitive: true,
181 read_only_friendly: false,
182 plan_scope: false,
183 },
184 "verify_build" => ToolMetadata {
185 category: ToolCategory::Verification,
186 mutates_workspace: false,
187 external_surface: false,
188 trust_sensitive: false,
189 read_only_friendly: true,
190 plan_scope: false,
191 },
192 "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
193 ToolMetadata {
194 category: ToolCategory::Git,
195 mutates_workspace: true,
196 external_surface: false,
197 trust_sensitive: true,
198 read_only_friendly: false,
199 plan_scope: false,
200 }
201 }
202 "research_web" | "fetch_docs" => ToolMetadata {
203 category: ToolCategory::Research,
204 mutates_workspace: false,
205 external_surface: false,
206 trust_sensitive: false,
207 read_only_friendly: true,
208 plan_scope: false,
209 },
210 "vision_analyze" => ToolMetadata {
211 category: ToolCategory::Vision,
212 mutates_workspace: false,
213 external_surface: false,
214 trust_sensitive: false,
215 read_only_friendly: true,
216 plan_scope: false,
217 },
218 "lsp_definitions"
219 | "lsp_references"
220 | "lsp_hover"
221 | "lsp_rename_symbol"
222 | "lsp_get_diagnostics"
223 | "lsp_search_symbol" => ToolMetadata {
224 category: ToolCategory::Lsp,
225 mutates_workspace: false,
226 external_surface: false,
227 trust_sensitive: false,
228 read_only_friendly: true,
229 plan_scope: false,
230 },
231 "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
232 category: ToolCategory::Workflow,
233 mutates_workspace: false,
234 external_surface: false,
235 trust_sensitive: false,
236 read_only_friendly: true,
237 plan_scope: true,
238 },
239 "manage_tasks" => ToolMetadata {
240 category: ToolCategory::Workflow,
241 mutates_workspace: false,
242 external_surface: false,
243 trust_sensitive: false,
244 read_only_friendly: true,
245 plan_scope: false,
246 },
247 _ => ToolMetadata {
248 category: ToolCategory::Other,
249 mutates_workspace: false,
250 external_surface: false,
251 trust_sensitive: false,
252 read_only_friendly: true,
253 plan_scope: false,
254 },
255 }
256}
257
/// One message in a chat conversation, in the shape sent to and stored from
/// the chat endpoint.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ChatMessage {
    /// Speaker role; the constructors in this file use `system`, `user`,
    /// `assistant` and `tool`.
    pub role: String,
    /// Message body, either plain text or a list of typed parts.
    pub content: MessageContent,
    /// Tool calls attached to the message; omitted from the serialized form
    /// when empty and defaulted to empty when absent on input.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tool_calls: Vec<ToolCallResponse>,
    /// Id of the tool call this message answers; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
    /// Tool/function name for tool results; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
}
277
/// Body of a [`ChatMessage`].
///
/// Untagged for serde: `Text` (de)serializes as a bare string and `Parts`
/// as an array of [`ContentPart`] values.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(untagged)]
pub enum MessageContent {
    /// Plain text content.
    Text(String),
    /// Multi-part content (text and/or image references).
    Parts(Vec<ContentPart>),
}
284
/// One element of multi-part message content.
///
/// Internally tagged for serde via the `type` key.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(tag = "type")]
pub enum ContentPart {
    /// Text fragment; tagged as `"type": "text"`.
    #[serde(rename = "text")]
    Text { text: String },
    /// Image reference; tagged as `"type": "image_url"`.
    #[serde(rename = "image_url")]
    ImageUrl { image_url: ImageUrlSource },
}
293
/// Wrapper object holding the URL of an image content part.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ImageUrlSource {
    /// Location of the image.
    /// NOTE(review): may also be a `data:` URL — confirm against callers.
    pub url: String,
}
298
299impl Default for MessageContent {
300 fn default() -> Self {
301 MessageContent::Text(String::new())
302 }
303}
304
305impl MessageContent {
306 pub fn as_str(&self) -> &str {
307 match self {
308 MessageContent::Text(s) => s,
309 MessageContent::Parts(parts) => {
310 for part in parts {
311 if let ContentPart::Text { text } = part {
312 return text;
313 }
314 }
315 ""
316 }
317 }
318 }
319}
320
321impl ChatMessage {
322 pub fn system(content: &str) -> Self {
323 Self {
324 role: "system".into(),
325 content: MessageContent::Text(content.into()),
326 tool_calls: Vec::new(),
327 tool_call_id: None,
328 name: None,
329 }
330 }
331 pub fn user(content: &str) -> Self {
332 Self {
333 role: "user".into(),
334 content: MessageContent::Text(content.into()),
335 tool_calls: Vec::new(),
336 tool_call_id: None,
337 name: None,
338 }
339 }
340 pub fn user_with_image(text: &str, image_url: &str) -> Self {
341 let mut text_parts = text.to_string();
342 if !text_parts.contains("<|image|>") {
343 text_parts.push_str(" <|image|>");
344 }
345 Self {
346 role: "user".into(),
347 content: MessageContent::Parts(vec![
348 ContentPart::Text { text: text_parts },
349 ContentPart::ImageUrl {
350 image_url: ImageUrlSource {
351 url: image_url.into(),
352 },
353 },
354 ]),
355 tool_calls: Vec::new(),
356 tool_call_id: None,
357 name: None,
358 }
359 }
360 pub fn assistant_text(content: &str) -> Self {
361 Self {
362 role: "assistant".into(),
363 content: MessageContent::Text(content.into()),
364 tool_calls: Vec::new(),
365 tool_call_id: None,
366 name: None,
367 }
368 }
369 pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
370 Self {
371 role: "assistant".into(),
372 content: MessageContent::Text(content.into()),
373 tool_calls: calls,
374 tool_call_id: None,
375 name: None,
376 }
377 }
378 pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
379 Self::tool_result_for_model(tool_call_id, fn_name, content, "")
380 }
381
382 pub fn tool_result_for_model(
385 tool_call_id: &str,
386 fn_name: &str,
387 content: &str,
388 model: &str,
389 ) -> Self {
390 let body = if is_gemma4_model_name(model) {
391 format!(
392 "<|tool_response>response:{}{}{}<tool_response|>",
393 fn_name, "{", content
394 )
395 } else {
396 content.to_string()
397 };
398 Self {
399 role: "tool".into(),
400 content: MessageContent::Text(body),
401 tool_calls: Vec::new(),
402 tool_call_id: Some(tool_call_id.into()),
403 name: Some(fn_name.into()),
404 }
405 }
406}
407
/// A single tool call as it appears on an assistant message.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallResponse {
    /// Identifier of the call; tool results refer back to it via `tool_call_id`.
    pub id: String,
    /// (De)serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub call_type: String,
    /// Name and arguments of the function being called.
    pub function: ToolCallFn,
}
417
/// Function name and arguments of a tool call.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallFn {
    /// Name of the tool being invoked.
    pub name: String,
    /// Arguments carried as a string.
    /// NOTE(review): presumably JSON-encoded text — confirm against the parser.
    pub arguments: String,
}
424
/// Serializable chat request payload.
/// NOTE(review): presumably the JSON body posted to `api_url` — the sender is
/// not visible in this chunk.
#[derive(Serialize)]
struct ChatRequest {
    /// Model identifier to run the request against.
    model: String,
    /// Conversation so far.
    messages: Vec<ChatMessage>,
    /// Sampling temperature.
    temperature: f32,
    /// Whether a streamed response is requested.
    stream: bool,
    /// Tool definitions offered to the model; omitted from the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    tools: Option<Vec<ToolDefinition>>,
}
436
/// Deserialized chat response.
#[derive(Deserialize, Debug)]
struct ChatResponse {
    /// Completion candidates returned by the provider.
    choices: Vec<ResponseChoice>,
    /// Token accounting, when the provider includes it.
    usage: Option<TokenUsage>,
}
442
/// Token accounting reported alongside a chat response.
#[derive(Deserialize, Debug, Clone)]
pub struct TokenUsage {
    /// Tokens consumed by the prompt.
    pub prompt_tokens: usize,
    /// Tokens produced in the completion.
    pub completion_tokens: usize,
    /// Total reported by the provider.
    pub total_tokens: usize,
    /// Defaults to 0 when the field is absent.
    /// NOTE(review): presumably a provider-specific prompt-cache counter — confirm.
    #[serde(default)]
    pub prompt_cache_hit_tokens: usize,
    /// Defaults to 0 when the field is absent.
    /// NOTE(review): presumably a provider-specific cache-read counter — confirm.
    #[serde(default)]
    pub cache_read_input_tokens: usize,
}
453
/// One completion candidate inside a [`ChatResponse`].
#[derive(Deserialize, Debug)]
struct ResponseChoice {
    /// The generated assistant message.
    message: ResponseMessage,
    /// Reason reported for the end of generation; `None` when absent.
    #[serde(default)]
    finish_reason: Option<String>,
}
460
/// Assistant message as returned by the provider.
#[derive(Deserialize, Debug)]
struct ResponseMessage {
    /// Text content, if any.
    content: Option<String>,
    /// Tool calls requested by the model, if any.
    tool_calls: Option<Vec<ToolCallResponse>>,
    /// Separate reasoning text, when the provider supplies one; `None` when absent.
    #[serde(default)]
    reasoning_content: Option<String>,
}
471
// Lower and upper bounds, in tokens, for the reserved output budget.
// NOTE(review): the code that applies these bounds is not visible in this
// chunk — confirm how they are used.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1024;
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4096;
474
/// A context window of at most 8_192 tokens counts as "tiny".
fn is_tiny_context_window(context_length: usize) -> bool {
    matches!(context_length, 0..=8_192)
}
478
/// A context window larger than 8_192 and at most 49_152 tokens counts as
/// "compact".
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
482
/// Public entry point for the private `is_compact_context_window` check:
/// returns `true` for context lengths above 8_192 and up to 49_152 tokens.
pub fn is_compact_context_window_pub(context_length: usize) -> bool {
    is_compact_context_window(context_length)
}
486
/// Detects provider error text that signals a context-limit failure.
///
/// `lower` must already be lowercased; the function performs plain substring
/// checks. It matches when both `n_keep` and `n_ctx` appear, or when any of
/// the known context-limit phrases is present.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    if lower.contains("n_keep") && lower.contains("n_ctx") {
        return true;
    }
    PHRASES.iter().any(|phrase| lower.contains(*phrase))
}
494
495fn classify_runtime_failure_tag(detail: &str) -> &'static str {
496 let lower = detail.to_ascii_lowercase();
497 if lower.contains("context_window_blocked")
498 || lower.contains("context ceiling reached")
499 || lower.contains("exceeds the")
500 || is_provider_context_limit_detail(&lower)
501 {
502 "context_window"
503 } else if lower.contains("empty response from model")
504 || lower.contains("model returned an empty response")
505 {
506 "empty_model_response"
507 } else if lower.contains("action blocked:")
508 || lower.contains("access denied")
509 || lower.contains("declined by user")
510 {
511 "tool_policy_blocked"
512 } else {
513 "provider_degraded"
514 }
515}
516
/// Returns the operator guidance sentence for a failure tag.
///
/// Unknown tags receive the generic "retry, then narrow or restart" advice.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const CONTEXT_WINDOW: &str = "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    const EMPTY_RESPONSE: &str = "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    const POLICY_BLOCKED: &str =
        "Stay inside the allowed workflow or switch modes before retrying.";
    const GENERIC: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";

    if tag == "context_window" {
        CONTEXT_WINDOW
    } else if tag == "empty_model_response" {
        EMPTY_RESPONSE
    } else if tag == "tool_policy_blocked" {
        POLICY_BLOCKED
    } else {
        GENERIC
    }
}
531
532fn format_runtime_failure_message(detail: &str) -> String {
533 let tag = classify_runtime_failure_tag(detail);
534 format!(
535 "[failure:{}] {} Detail: {}",
536 tag,
537 runtime_failure_guidance(tag),
538 detail.trim()
539 )
540}
541
/// Health state reported for the model provider.
///
/// `provider_state_for_failure_tag` produces `ContextWindow`,
/// `EmptyResponse` and `Degraded`; the other variants are set by code
/// outside this chunk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProviderRuntimeState {
    Booting,
    Live,
    Recovering,
    /// Generic failure state; the fallback for unrecognised failure tags.
    Degraded,
    /// The failure was classified as `context_window`.
    ContextWindow,
    /// The failure was classified as `empty_model_response`.
    EmptyResponse,
}
551
/// Health state reported for the MCP integration (carried by
/// `InferenceEvent::McpStatus`).
/// NOTE(review): the transitions between states are decided outside this chunk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum McpRuntimeState {
    Unconfigured,
    Healthy,
    Degraded,
    Failed,
}
559
/// Checkpoint states surfaced to the operator (carried by
/// `InferenceEvent::OperatorCheckpoint`). Each variant has a stable
/// snake_case label available through `label()`.
/// NOTE(review): the conditions that raise each state live outside this chunk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OperatorCheckpointState {
    Idle,
    RecoveringProvider,
    BudgetReduced,
    HistoryCompacted,
    BlockedContextWindow,
    BlockedPolicy,
    BlockedRecentFileEvidence,
    BlockedExactLineWindow,
    BlockedToolLoop,
    BlockedVerification,
}
573
574impl OperatorCheckpointState {
575 pub fn label(self) -> &'static str {
576 match self {
577 OperatorCheckpointState::Idle => "idle",
578 OperatorCheckpointState::RecoveringProvider => "recovering_provider",
579 OperatorCheckpointState::BudgetReduced => "budget_reduced",
580 OperatorCheckpointState::HistoryCompacted => "history_compacted",
581 OperatorCheckpointState::BlockedContextWindow => "blocked_context_window",
582 OperatorCheckpointState::BlockedPolicy => "blocked_policy",
583 OperatorCheckpointState::BlockedRecentFileEvidence => "blocked_recent_file_evidence",
584 OperatorCheckpointState::BlockedExactLineWindow => "blocked_exact_line_window",
585 OperatorCheckpointState::BlockedToolLoop => "blocked_tool_loop",
586 OperatorCheckpointState::BlockedVerification => "blocked_verification",
587 }
588 }
589}
590
591fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
592 match tag {
593 "context_window" => ProviderRuntimeState::ContextWindow,
594 "empty_model_response" => ProviderRuntimeState::EmptyResponse,
595 _ => ProviderRuntimeState::Degraded,
596 }
597}
598
/// Produces a short, single-line summary of a runtime failure.
///
/// The three known tags (`context_window`, `empty_model_response`,
/// `tool_policy_blocked`) map to fixed sentences. Any other tag yields
/// `LM Studio degraded: <excerpt>`, where the excerpt is the first 12
/// whitespace-separated words of `detail`, cut to at most 110 bytes and
/// followed by `...` when it was cut. An empty excerpt falls back to a
/// generic "degraded" sentence.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    /// Maximum number of words taken from the detail.
    const MAX_EXCERPT_WORDS: usize = 12;
    /// Maximum excerpt size in bytes before the ellipsis is appended.
    const MAX_EXCERPT_BYTES: usize = 110;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(MAX_EXCERPT_WORDS)
                .collect::<Vec<_>>()
                .join(" ");
            if excerpt.len() > MAX_EXCERPT_BYTES {
                // `String::truncate` panics when the cut falls inside a
                // multi-byte character, so back up to the nearest boundary.
                // Offset 0 is always a boundary, so the loop terminates.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }
            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
632
/// Event messages describing inference activity.
///
/// NOTE(review): the producers and consumers of these events are outside this
/// chunk (presumably they travel over a channel to the UI — confirm). The
/// notes below state only what the variant shapes show.
#[derive(Debug)]
pub enum InferenceEvent {
    /// A piece of generated text.
    Token(String),
    /// A piece of generated text flagged as muted.
    /// NOTE(review): presumably shown but not spoken — confirm.
    MutedToken(String),
    /// A piece of reasoning text.
    Thought(String),
    /// Status text from the voice subsystem.
    VoiceStatus(String),
    /// A tool call is starting.
    ToolCallStart {
        id: String,
        name: String,
        args: String,
    },
    /// A tool call finished; `is_error` marks a failed call.
    ToolCallResult {
        id: String,
        name: String,
        output: String,
        is_error: bool,
    },
    /// A tool call needs an operator decision before it can proceed.
    ApprovalRequired {
        id: String,
        name: String,
        /// Text describing the pending action.
        display: String,
        /// Optional diff accompanying the pending action.
        diff: Option<String>,
        /// One-shot channel on which the `bool` decision is sent back.
        /// NOTE(review): presumably `true` means approved — confirm.
        responder: tokio::sync::oneshot::Sender<bool>,
    },
    /// The current turn is finished.
    Done,
    /// An error message.
    Error(String),
    /// Provider health update with a human-readable summary.
    ProviderStatus {
        state: ProviderRuntimeState,
        summary: String,
    },
    /// Operator checkpoint update with a human-readable summary.
    OperatorCheckpoint {
        state: OperatorCheckpointState,
        summary: String,
    },
    /// Summary of a recovery recipe.
    RecoveryRecipe { summary: String },
    /// MCP health update with a human-readable summary.
    McpStatus {
        state: McpRuntimeState,
        summary: String,
    },
    /// Estimated token usage relative to a compaction threshold.
    CompactionPressure {
        estimated_tokens: usize,
        threshold_tokens: usize,
        percent: u8,
    },
    /// Estimated prompt size relative to the context window.
    PromptPressure {
        estimated_input_tokens: usize,
        reserved_output_tokens: usize,
        estimated_total_tokens: usize,
        context_length: usize,
        percent: u8,
    },
    /// Progress update for a labelled task.
    /// NOTE(review): `progress` is presumably a 0-100 percentage — confirm.
    TaskProgress {
        id: String,
        label: String,
        progress: u8,
    },
    /// Token usage figures from the provider.
    UsageUpdate(TokenUsage),
    /// The active model id and context length.
    RuntimeProfile {
        model_id: String,
        context_length: usize,
    },
    /// Counters describing the "vein" index.
    /// NOTE(review): semantics of the vein subsystem are not visible here.
    VeinStatus {
        file_count: usize,
        embedded_count: usize,
        docs_only: bool,
    },
    /// Paths associated with the vein context.
    VeinContext { paths: Vec<String> },
    /// A new persona roll.
    SoulReroll {
        species: String,
        rarity: String,
        shiny: bool,
        personality: String,
    },
    /// The embedding model in use, if any.
    EmbedProfile { model_id: Option<String> },
    /// One line of shell output.
    ShellLine(String),
}
741
742impl InferenceEngine {
745 pub fn new(
746 api_url: String,
747 species: String,
748 snark: u8,
749 ) -> Result<Self, Box<dyn std::error::Error>> {
750 let client = reqwest::Client::builder()
751 .timeout(std::time::Duration::from_secs(180))
752 .build()?;
753
754 let base_url = {
756 let trimmed = api_url.trim_end_matches('/');
757 if let Some(scheme_end) = trimmed.find("://") {
758 let after_scheme = &trimmed[scheme_end + 3..];
759 if let Some(path_start) = after_scheme.find('/') {
760 format!(
761 "{}://{}",
762 &trimmed[..scheme_end],
763 &after_scheme[..path_start]
764 )
765 } else {
766 trimmed.to_string()
767 }
768 } else {
769 trimmed.to_string()
770 }
771 };
772
773 let api_url = if api_url.ends_with("/chat/completions") {
774 api_url
775 } else if api_url.ends_with("/") {
776 format!("{}chat/completions", api_url)
777 } else {
778 format!("{}/chat/completions", api_url)
779 };
780
781 Ok(Self {
782 client,
783 api_url,
784 base_url,
785 species,
786 snark,
787 kv_semaphore: Semaphore::new(3),
788 model: std::sync::RwLock::new(String::new()),
789 context_length: std::sync::atomic::AtomicUsize::new(32_768), economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
791 worker_model: None,
792 gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
793 cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
794 })
795 }
796
797 pub fn set_gemma_native_formatting(&self, enabled: bool) {
798 self.gemma_native_formatting
799 .store(enabled, std::sync::atomic::Ordering::SeqCst);
800 }
801
802 pub fn gemma_native_formatting_enabled(&self) -> bool {
803 self.gemma_native_formatting
804 .load(std::sync::atomic::Ordering::SeqCst)
805 }
806
807 pub fn current_model(&self) -> String {
808 self.model.read().map(|g| g.clone()).unwrap_or_default()
809 }
810
811 pub fn current_context_length(&self) -> usize {
812 self.context_length
813 .load(std::sync::atomic::Ordering::SeqCst)
814 }
815
816 pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
817 if let Ok(mut guard) = self.model.write() {
818 *guard = model.to_string();
819 }
820 self.context_length
821 .store(context_length, std::sync::atomic::Ordering::SeqCst);
822 }
823
824 pub async fn health_check(&self) -> bool {
826 let url = format!("{}/v1/models", self.base_url);
827 match self.client.get(&url).send().await {
828 Ok(resp) => resp.status().is_success(),
829 Err(_) => false,
830 }
831 }
832
833 pub async fn get_loaded_model(&self) -> Option<String> {
841 #[derive(Deserialize)]
842 struct ModelList {
843 data: Vec<ModelEntry>,
844 }
845 #[derive(Deserialize)]
846 struct ModelEntry {
847 id: String,
848 #[serde(rename = "type", default)]
849 model_type: String,
850 #[serde(default)]
851 state: String,
852 }
853
854 if let Ok(resp) = self
856 .client
857 .get(format!("{}/api/v0/models", self.base_url))
858 .send()
859 .await
860 {
861 if let Ok(list) = resp.json::<ModelList>().await {
862 let chat_model = list
863 .data
864 .into_iter()
865 .find(|m| m.model_type != "embeddings" && m.state == "loaded")
866 .map(|m| m.id)
867 .unwrap_or_default();
868 return Some(chat_model);
869 }
870 }
871
872 let resp = self
874 .client
875 .get(format!("{}/v1/models", self.base_url))
876 .send()
877 .await
878 .ok()?;
879 let list: ModelList = resp.json().await.ok()?;
880 Some(
881 list.data
882 .into_iter()
883 .find(|m| !m.id.to_lowercase().contains("embed"))
884 .map(|m| m.id)
885 .unwrap_or_default(),
886 )
887 }
888
889 pub async fn get_embedding_model(&self) -> Option<String> {
895 #[derive(Deserialize)]
896 struct ModelList {
897 data: Vec<ModelEntry>,
898 }
899 #[derive(Deserialize)]
900 struct ModelEntry {
901 id: String,
902 #[serde(rename = "type", default)]
903 model_type: String,
904 #[serde(default)]
905 state: String,
906 }
907 let resp = self
908 .client
909 .get(format!("{}/api/v0/models", self.base_url))
910 .send()
911 .await
912 .ok()?;
913 let list: ModelList = resp.json().await.ok()?;
914 list.data
915 .into_iter()
916 .find(|m| m.model_type == "embeddings" && m.state == "loaded")
917 .map(|m| m.id)
918 }
919
920 pub async fn detect_context_length(&self) -> usize {
926 #[derive(Deserialize)]
927 struct LmStudioModel {
928 id: Option<String>,
929 #[serde(rename = "type", default)]
930 model_type: String,
931 state: Option<String>,
932 loaded_context_length: Option<u64>,
933 context_length: Option<u64>,
934 max_context_length: Option<u64>,
935 }
936 #[derive(Deserialize)]
937 struct LmStudioList {
938 data: Vec<LmStudioModel>,
939 }
940
941 if let Ok(resp) = self
943 .client
944 .get(format!("{}/api/v0/models", self.base_url))
945 .send()
946 .await
947 {
948 if let Ok(list) = resp.json::<LmStudioList>().await {
949 let target_model = self.current_model().to_ascii_lowercase();
950 let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
952 let loaded = list
953 .data
954 .iter()
955 .find(|m| {
956 non_embed(m)
957 && m.state.as_deref() == Some("loaded")
958 && m.id
959 .as_deref()
960 .map(|id| id.eq_ignore_ascii_case(&target_model))
961 .unwrap_or(false)
962 })
963 .or_else(|| {
964 list.data
965 .iter()
966 .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
967 })
968 .or_else(|| {
969 list.data.iter().find(|m| {
970 non_embed(m)
971 && m.id
972 .as_deref()
973 .map(|id| id.eq_ignore_ascii_case(&target_model))
974 .unwrap_or(false)
975 })
976 })
977 .or_else(|| list.data.iter().find(|m| non_embed(m)));
978
979 if let Some(model) = loaded {
980 if let Some(ctx) = model.loaded_context_length {
981 if ctx > 0 {
982 return ctx as usize;
983 }
984 }
985 if let Some(ctx) = model.context_length {
986 if ctx > 0 {
987 return ctx as usize;
988 }
989 }
990 if let Some(ctx) = model.max_context_length {
991 if ctx > 0 && ctx <= 32_768 {
992 return ctx as usize;
993 }
994 }
995 }
996 }
997 }
998
999 if self.current_model().to_lowercase().contains("gemma-4") {
1003 return 32_768;
1004 }
1005
1006 32_768
1007 }
1008
1009 pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
1010 let previous_model = self.current_model();
1011 let previous_context = self.current_context_length();
1012
1013 let detected_model = match self.get_loaded_model().await {
1014 Some(m) if !m.is_empty() => m, Some(_) => "no model loaded".to_string(), None => previous_model.clone(), };
1018
1019 if !detected_model.is_empty() && detected_model != previous_model {
1020 if let Ok(mut guard) = self.model.write() {
1021 *guard = detected_model.clone();
1022 }
1023 }
1024
1025 let detected_context = self.detect_context_length().await;
1026 let effective_model = if detected_model.is_empty() {
1027 previous_model.clone()
1028 } else {
1029 detected_model
1030 };
1031
1032 let changed = effective_model != previous_model || detected_context != previous_context;
1033 self.set_runtime_profile(&effective_model, detected_context);
1034
1035 Some((effective_model, detected_context, changed))
1036 }
1037
1038 pub fn build_system_prompt(
1039 &self,
1040 snark: u8,
1041 chaos: u8,
1042 brief: bool,
1043 professional: bool,
1044 tools: &[ToolDefinition],
1045 reasoning_history: Option<&str>,
1046 mcp_tools: &[crate::agent::mcp::McpTool],
1047 ) -> String {
1048 let mut sys = self.build_system_prompt_legacy(
1049 snark,
1050 chaos,
1051 brief,
1052 professional,
1053 tools,
1054 reasoning_history,
1055 );
1056
1057 if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1058 sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1059 sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1060 for tool in mcp_tools {
1061 let description = tool
1062 .description
1063 .as_deref()
1064 .unwrap_or("No description provided.");
1065 sys.push_str(&format!("- {}: {}\n", tool.name, description));
1066 }
1067 }
1068
1069 sys
1070 }
1071
1072 pub fn build_system_prompt_legacy(
1073 &self,
1074 snark: u8,
1075 _chaos: u8,
1076 brief: bool,
1077 professional: bool,
1078 tools: &[ToolDefinition],
1079 reasoning_history: Option<&str>,
1080 ) -> String {
1081 let current_context_length = self.current_context_length();
1082 if is_tiny_context_window(current_context_length) {
1083 return self.build_system_prompt_tiny(brief, professional);
1084 }
1085 if is_compact_context_window(current_context_length) {
1086 return self.build_system_prompt_compact(brief, professional, tools);
1087 }
1088
1089 let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
1091 - You are Hematite, a local coding system working on the user's machine.\n\
1092 - The running Hematite build is ");
1093 sys.push_str(&crate::hematite_version_display());
1094 sys.push_str(".\n\
1095 - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
1096 - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
1097 - For simple questions, answer briefly in plain language.\n\
1098 - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
1099 - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
1100 - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
1101 - Keep internal reasoning inside channel delimiters.\n\
1102 - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
1103 <turn|>\n\n");
1104
1105 if let Some(history) = reasoning_history {
1106 if !history.is_empty() {
1107 sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
1108 sys.push_str(history);
1109 sys.push_str("\n\n");
1110 }
1111 }
1112
1113 if brief {
1115 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
1116 - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
1117 - Depth: Surface-level verification only.\n\n");
1118 } else {
1119 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
1120 - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
1121 - Depth: Full multi-step derivation required.\n\n");
1122 }
1123
1124 let os = std::env::consts::OS;
1126 if professional {
1127 sys.push_str(&format!(
1128 "You are Hematite, a local coding system running on {}. \
1129 The TUI is one interface layer, not your whole identity. \
1130 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
1131 Skip filler and keep the focus on the work.\n",
1132 os
1133 ));
1134 } else {
1135 sys.push_str(&format!(
1136 "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
1137 The terminal UI is only one surface of the system. \
1138 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
1139 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
1140 self.species, snark, os
1141 ));
1142 }
1143
1144 let current_model = self.current_model();
1146 if !current_model.is_empty() {
1147 sys.push_str(&format!(
1148 "Loaded model: {} | Context window: {} tokens. \
1149 Calibrate response length and tool-call depth to fit within this budget.\n\n",
1150 current_model, current_context_length
1151 ));
1152 if is_gemma4_model_name(¤t_model) {
1153 sys.push_str(
1154 "Gemma 4 native note: prefer exact tool JSON with no extra prose when calling tools. \
1155 Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
1156 For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
1157 );
1158 }
1159 } else {
1160 sys.push_str(&format!(
1161 "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
1162 current_context_length
1163 ));
1164 }
1165
1166 let shell_desc = if cfg!(target_os = "windows") {
1168 "[EXTERNAL SHELL]: `powershell` (Windows).\n\
1169 - Use ONLY for builds, tests, or file migrations. \n\
1170 - You MUST use the `powershell` tool directly. \n\
1171 - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
1172 } else {
1173 "[EXTERNAL SHELL]: `bash` (Unix).\n\
1174 - Use ONLY for builds, tests, or file migrations. \n\
1175 - NEVER wrap bash in other shells. \n\n"
1176 };
1177
1178 sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
1179 [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
1180 - These are the ONLY way to explore and modify code. \n\
1181 - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
1182 sys.push_str(shell_desc);
1183
1184 sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
1186 SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
1187
1188 if brief {
1189 sys.push_str(
1190 "BRIEF MODE: Respond in exactly ONE concise sentence unless providing code.\n\n",
1191 );
1192 }
1193
1194 sys.push_str("## CORE DIRECTIVES\n\
1195 1. REASONING: Your internal reasoning goes in <think>...</think> blocks. Do NOT output reasoning as plain text.\n\
1196 2. CONCISENESS: After <think>, output ONE concise technical sentence or code block. Nothing else.\n\
1197 3. GROUNDEDNESS: Never invent tools, channels, or files. If a detail is not verified from tool output, say `uncertain`. Answer from stable Hematite capabilities unless repo implementation is requested.\n\n\
1198 ## ARCHITECTURAL DISCIPLINE\n\
1199 - HOST INSPECTION PRIORITY: MANDATORY. For all diagnostic questions (load, CPU/RAM, processes, toolchains, network, ports, OS config, log-checks), prefer `inspect_host` over raw `shell`. If `env_doctor` answers, do not follow with `path` unless requested.\n\
1200 - WORKFLOW PRIORITY: Prefer `run_workspace_workflow` for project builds/tests and `run_hematite_maintainer_workflow` for app-level scripts over raw `shell`.\n\
1201 - PROOF BEFORE ACTION: `read_file` or `inspect_lines` before editing. For files >200 lines, `grep_files` for a pattern BEFORE reading.\n\
1202 - PROOF BEFORE COMMIT: Do not `git_commit` until a successful `verify_build` exists for the latest changes.\n\
1203 - BUILT-IN FIRST: Prefer internal file tools over `mcp__filesystem__*` unless MCP is explicitly required.\n\n\
1204 ## TECHNIQUE & SAFETY\n\
1205 - SHELL DISCIPLINE: Risky `shell` calls REQUIRE a `reason` argument. Always use `powershell` on Windows; never `bash` or `/dev/null`.\n\
1206 - EDIT & TOOL PRECISION: Use unique lines/anchors for `edit_file`. Do not call tools like 'think' or 'reasoning'. Paths and symbols are NOT tool names.\n\
1207 - CONTEXT AWARENESS: Answer at the harness level (file ops, shell, build). Prefer real language examples (Python, C#, TS, Go). Never mention `mcp__*` tools unless active and relevant.\n\
1208 - TOOLING DISCIPLINE: Prefer `describe_toolchain` over improvising tool surface from memory; preserve its identifiers exactly.");
1209
1210 sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
1212 2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
1213 3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
1214 4. Fix all errors before declaring success.\n\n\
1215 ## PRE-FLIGHT SCOPING PROTOCOL\n\
1216 Before attempting any multi-file task or complex refactor:\n\
1217 1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
1218 2. Use `auto_pin_context` to keep those files in active context.\n\
1219 3. Only then proceed to deeper edits or research.\n\n\
1220 ## REFACTORING PROTOCOL\n\
1221 When modifying existing code or renaming symbols:\n\
1222 1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
1223 2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
1224 3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
1225
1226 sys.push_str(&load_instruction_files());
1228
1229 sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
1231
1232 if !tools.is_empty() {
1234 sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
1235 for tool in tools {
1236 let schema = serde_json::to_string(&tool.function.parameters)
1237 .unwrap_or_else(|_| "{}".to_string());
1238 sys.push_str(&format!(
1239 "<|tool>declaration:{}{}{}<tool|>\n",
1240 tool.function.name, "{", schema
1241 ));
1242 sys.push_str(&format!("// {})\n", tool.function.description));
1243 }
1244 }
1245
1246 sys
1247 }
1248
1249 fn build_system_prompt_compact(
1250 &self,
1251 brief: bool,
1252 professional: bool,
1253 tools: &[ToolDefinition],
1254 ) -> String {
1255 let current_model = self.current_model();
1258 let current_context_length = self.current_context_length();
1259 let os = std::env::consts::OS;
1260
1261 let mut sys = String::from("<|turn>system\n<|think|>\n");
1262 sys.push_str(&format!(
1263 "You are Hematite {}, a local coding harness working on the user's machine.\n",
1264 crate::hematite_version_display()
1265 ));
1266 if professional {
1267 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1268 } else {
1269 sys.push_str(&format!(
1270 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1271 self.species
1272 ));
1273 }
1274 sys.push_str(&format!(
1275 "Model: {} | Context: {} tokens. Keep turns focused.\n",
1276 current_model, current_context_length
1277 ));
1278 if is_gemma4_model_name(¤t_model) {
1279 sys.push_str(
1280 "Gemma 4: use exact tool JSON. No extra prose in tool calls. \
1281 Raw regex patterns in grep_files, no slash delimiters.\n",
1282 );
1283 }
1284 if cfg!(target_os = "windows") {
1285 sys.push_str(&format!(
1286 "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1287 os
1288 ));
1289 } else {
1290 sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1291 }
1292 if brief {
1293 sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1294 }
1295
1296 sys.push_str(
1297 "\nCORE RULES:\n\
1298 - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1299 - Verify after edits: run `verify_build` after code changes, before committing.\n\
1300 - One tool at a time. Do not batch unrelated tool calls.\n\
1301 - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1302 - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1303 - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1304 );
1305
1306 if !tools.is_empty() {
1307 sys.push_str("\n# AVAILABLE TOOLS\n");
1308 for tool in tools {
1309 let desc: String = tool.function.description.chars().take(120).collect();
1310 sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1311 }
1312 }
1313
1314 sys.push_str("<turn|>\n");
1315 sys
1316 }
1317
1318 fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1319 let current_model = self.current_model();
1320 let current_context_length = self.current_context_length();
1321 let os = std::env::consts::OS;
1322 let mut sys = format!(
1323 "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1324 crate::hematite_version_display()
1325 );
1326 if professional {
1327 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1328 } else {
1329 sys.push_str(&format!(
1330 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1331 self.species
1332 ));
1333 }
1334 if !current_model.is_empty() {
1335 sys.push_str(&format!(
1336 "Loaded model: {} | Context window: {} tokens.\n",
1337 current_model, current_context_length
1338 ));
1339 } else {
1340 sys.push_str(&format!(
1341 "Context window: {} tokens.\n",
1342 current_context_length
1343 ));
1344 }
1345 sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1346 sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1347 sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1348 if cfg!(target_os = "windows") {
1349 sys.push_str(&format!(
1350 "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1351 os
1352 ));
1353 } else {
1354 sys.push_str(&format!(
1355 "You are running on {}. Use the native Unix shell conventions.\n",
1356 os
1357 ));
1358 }
1359 if brief {
1360 sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1361 }
1362 if is_gemma4_model_name(¤t_model) {
1363 sys.push_str(
1364 "Gemma 4 note: use exact tool JSON with no extra prose when calling tools.\n",
1365 );
1366 }
1367 sys.push_str("<turn|>\n");
1368 sys
1369 }
1370
1371 pub async fn call_with_tools(
1376 &self,
1377 messages: &[ChatMessage],
1378 tools: &[ToolDefinition],
1379 model_override: Option<&str>,
1381 ) -> Result<
1382 (
1383 Option<String>,
1384 Option<Vec<ToolCallResponse>>,
1385 Option<TokenUsage>,
1386 Option<String>,
1387 ),
1388 String,
1389 > {
1390 let _permit = self
1391 .kv_semaphore
1392 .acquire()
1393 .await
1394 .map_err(|e| e.to_string())?;
1395
1396 let current_model = self.current_model();
1397 let model = model_override.unwrap_or(current_model.as_str()).to_string();
1398 let filtered_tools = if cfg!(target_os = "windows") {
1399 tools
1400 .iter()
1401 .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1402 .cloned()
1403 .collect::<Vec<_>>()
1404 } else {
1405 tools.to_vec()
1406 };
1407
1408 let request_messages = if should_use_gemma_native_formatting(self, &model) {
1409 prepare_gemma_native_messages(messages)
1410 } else {
1411 messages.to_vec()
1412 };
1413
1414 const COMPACT_CORE_TOOLS: &[&str] = &[
1419 "read_file",
1420 "inspect_lines",
1421 "edit_file",
1422 "write_file",
1423 "grep_files",
1424 "list_files",
1425 "verify_build",
1426 "shell",
1427 ];
1428 let effective_tools = if is_compact_context_window(self.current_context_length()) {
1429 let core: Vec<_> = filtered_tools
1430 .iter()
1431 .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1432 .cloned()
1433 .collect();
1434 if core.is_empty() {
1435 None
1436 } else {
1437 Some(core)
1438 }
1439 } else if filtered_tools.is_empty() {
1440 None
1441 } else {
1442 Some(filtered_tools)
1443 };
1444
1445 let request = ChatRequest {
1446 model: model.clone(),
1447 messages: request_messages,
1448 temperature: 0.2,
1449 stream: false,
1450 tools: effective_tools,
1451 };
1452
1453 preflight_chat_request(
1455 &model,
1456 &request.messages,
1457 request.tools.as_deref().unwrap_or(&[]),
1458 self.current_context_length(),
1459 )?;
1460
1461 let mut last_err = String::new();
1462 let mut response_opt: Option<reqwest::Response> = None;
1463 for attempt in 0..3u32 {
1464 match self.client.post(&self.api_url).json(&request).send().await {
1465 Ok(res) if res.status().is_success() => {
1466 response_opt = Some(res);
1467 break;
1468 }
1469 Ok(res) if res.status().as_u16() >= 500 => {
1470 last_err = format!("LM Studio error {}", res.status());
1471 }
1472 Ok(res) => {
1473 let status = res.status();
1475 let body = res.text().await.unwrap_or_default();
1476 let preview = &body[..body.len().min(300)];
1477 return Err(format!("LM Studio error {}: {}", status, preview));
1478 }
1479 Err(e) if e.is_timeout() || e.is_connect() => {
1480 last_err = format!("Request failed: {}", e);
1481 }
1482 Err(e) => return Err(format!("Request failed: {}", e)),
1483 }
1484 if attempt < 2 {
1485 let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1486 tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1487 }
1488 }
1489 let res = response_opt
1490 .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1491
1492 let body: ChatResponse = res
1493 .json()
1494 .await
1495 .map_err(|e| format!("Response parse error: {}", e))?;
1496
1497 if let Some(usage) = &body.usage {
1498 let mut econ = self.economics.lock().unwrap();
1499 econ.input_tokens += usage.prompt_tokens;
1500 econ.output_tokens += usage.completion_tokens;
1501 }
1502
1503 let choice = body
1504 .choices
1505 .into_iter()
1506 .next()
1507 .ok_or_else(|| "Empty response from model".to_string())?;
1508
1509 let finish_reason = choice.finish_reason;
1510 let mut tool_calls = choice.message.tool_calls;
1511 let mut content = choice.message.content;
1512
1513 if let Some(raw_content) = &content {
1516 let native_calls = extract_native_tool_calls(raw_content);
1517 if !native_calls.is_empty() {
1518 let mut existing = tool_calls.unwrap_or_default();
1519 existing.extend(native_calls);
1520 tool_calls = Some(existing);
1521 let stripped = strip_native_tool_call_text(raw_content);
1522 content = if stripped.trim().is_empty() {
1523 None
1524 } else {
1525 Some(stripped)
1526 };
1527 }
1528 }
1529
1530 if is_gemma4_model_name(&model) {
1531 if let Some(calls) = tool_calls.as_mut() {
1532 for call in calls.iter_mut() {
1533 call.function.arguments = normalize_tool_argument_string(
1534 &call.function.name,
1535 &call.function.arguments,
1536 );
1537 }
1538 }
1539 }
1540
1541 let reasoning_text = choice.message.reasoning_content.unwrap_or_default();
1546 if tool_calls.as_ref().map(|v| v.is_empty()).unwrap_or(true)
1547 && content
1548 .as_ref()
1549 .map(|s| s.trim().is_empty())
1550 .unwrap_or(true)
1551 && !reasoning_text.is_empty()
1552 {
1553 let recovered = extract_native_tool_calls(&reasoning_text);
1554 if !recovered.is_empty() {
1555 tool_calls = Some(recovered);
1556 content = None;
1558 } else if finish_reason.as_deref() == Some("stop") {
1559 content = Some(reasoning_text);
1565 }
1566 }
1567
1568 Ok((content, tool_calls, body.usage, finish_reason))
1569 }
1570
1571 pub async fn stream_messages(
1575 &self,
1576 messages: &[ChatMessage],
1577 tx: mpsc::Sender<InferenceEvent>,
1578 ) -> Result<(), Box<dyn std::error::Error>> {
1579 let current_model = self.current_model();
1580 let request_messages = if should_use_gemma_native_formatting(self, ¤t_model) {
1581 prepare_gemma_native_messages(messages)
1582 } else {
1583 messages
1584 .iter()
1585 .map(|m| {
1586 let mut clone = m.clone();
1587 let current_text = m.content.as_str();
1588 if !current_text.starts_with("<|turn>") {
1589 clone.content = MessageContent::Text(format!(
1590 "<|turn>{}\n{}\n<turn|>",
1591 m.role, current_text
1592 ));
1593 }
1594 clone
1595 })
1596 .collect()
1597 };
1598
1599 let request = ChatRequest {
1600 model: current_model.clone(),
1601 messages: request_messages,
1602 temperature: 0.7,
1603 stream: true,
1604 tools: None,
1605 };
1606
1607 if let Err(e) = preflight_chat_request(
1608 ¤t_model,
1609 &request.messages,
1610 &[],
1611 self.current_context_length(),
1612 ) {
1613 let tag = classify_runtime_failure_tag(&e);
1614 let _ = tx
1615 .send(InferenceEvent::ProviderStatus {
1616 state: provider_state_for_failure_tag(tag),
1617 summary: compact_runtime_failure_summary(tag, &e),
1618 })
1619 .await;
1620 let _ = tx
1621 .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1622 .await;
1623 let _ = tx.send(InferenceEvent::Done).await;
1624 return Ok(());
1625 }
1626
1627 let mut last_err = String::new();
1628 let mut response_opt: Option<reqwest::Response> = None;
1629 for attempt in 0..2u32 {
1630 match self.client.post(&self.api_url).json(&request).send().await {
1631 Ok(res) if res.status().is_success() => {
1632 response_opt = Some(res);
1633 break;
1634 }
1635 Ok(res) if res.status().as_u16() >= 500 => {
1636 last_err = format!("LM Studio error {}", res.status());
1637 }
1638 Ok(res) => {
1639 let status = res.status();
1640 let body = res.text().await.unwrap_or_default();
1641 let preview = &body[..body.len().min(300)];
1642 let detail = format!("LM Studio error {}: {}", status, preview);
1643 let tag = classify_runtime_failure_tag(&detail);
1644 let _ = tx
1645 .send(InferenceEvent::ProviderStatus {
1646 state: provider_state_for_failure_tag(tag),
1647 summary: compact_runtime_failure_summary(tag, &detail),
1648 })
1649 .await;
1650 let _ = tx
1651 .send(InferenceEvent::Error(format_runtime_failure_message(
1652 &detail,
1653 )))
1654 .await;
1655 let _ = tx.send(InferenceEvent::Done).await;
1656 return Ok(());
1657 }
1658 Err(e) if e.is_timeout() || e.is_connect() => {
1659 last_err = format!("Request failed: {}", e);
1660 }
1661 Err(e) => {
1662 let detail = format!("Request failed: {}", e);
1663 let tag = classify_runtime_failure_tag(&detail);
1664 let _ = tx
1665 .send(InferenceEvent::ProviderStatus {
1666 state: provider_state_for_failure_tag(tag),
1667 summary: compact_runtime_failure_summary(tag, &detail),
1668 })
1669 .await;
1670 let _ = tx
1671 .send(InferenceEvent::Error(format_runtime_failure_message(
1672 &detail,
1673 )))
1674 .await;
1675 let _ = tx.send(InferenceEvent::Done).await;
1676 return Ok(());
1677 }
1678 }
1679 if attempt < 1 {
1680 let _ = tx
1681 .send(InferenceEvent::ProviderStatus {
1682 state: ProviderRuntimeState::Recovering,
1683 summary: "LM Studio degraded during stream startup; retrying once.".into(),
1684 })
1685 .await;
1686 tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1687 }
1688 }
1689 let Some(res) = response_opt else {
1690 let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1691 let tag = classify_runtime_failure_tag(&detail);
1692 let _ = tx
1693 .send(InferenceEvent::ProviderStatus {
1694 state: provider_state_for_failure_tag(tag),
1695 summary: compact_runtime_failure_summary(tag, &detail),
1696 })
1697 .await;
1698 let _ = tx
1699 .send(InferenceEvent::Error(format_runtime_failure_message(
1700 &detail,
1701 )))
1702 .await;
1703 let _ = tx.send(InferenceEvent::Done).await;
1704 return Ok(());
1705 };
1706
1707 use futures::StreamExt;
1708 let mut byte_stream = res.bytes_stream();
1709
1710 let mut line_buffer = String::new();
1713 let mut content_buffer = String::new();
1714 let mut past_think = false;
1715 let mut emitted_any_content = false;
1716 let mut emitted_live_status = false;
1717
1718 loop {
1721 let next = tokio::select! {
1722 chunk = byte_stream.next() => chunk,
1724 _ = tokio::time::sleep(std::time::Duration::from_millis(50)) => {
1725 if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1726 break;
1727 }
1728 continue;
1729 }
1730 };
1731
1732 let Some(item) = next else { break };
1733
1734 let chunk = match item {
1735 Ok(chunk) => chunk,
1736 Err(e) => {
1737 let detail = format!("Request failed: {}", e);
1738 let tag = classify_runtime_failure_tag(&detail);
1739 let _ = tx
1740 .send(InferenceEvent::ProviderStatus {
1741 state: provider_state_for_failure_tag(tag),
1742 summary: compact_runtime_failure_summary(tag, &detail),
1743 })
1744 .await;
1745 let _ = tx
1746 .send(InferenceEvent::Error(format_runtime_failure_message(
1747 &detail,
1748 )))
1749 .await;
1750 let _ = tx.send(InferenceEvent::Done).await;
1751 return Ok(());
1752 }
1753 };
1754 line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1755
1756 while let Some(pos) = line_buffer.find("\n\n") {
1757 let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1758 let data_pos = match event_str.find("data: ") {
1759 Some(p) => p,
1760 None => continue,
1761 };
1762
1763 let data = event_str[data_pos + 6..].trim();
1764 if data == "[DONE]" {
1765 break;
1766 }
1767
1768 if let Ok(json) = serde_json::from_str::<Value>(data) {
1769 let delta = &json["choices"][0]["delta"];
1770
1771 if let Some(reasoning) = delta["reasoning_content"]
1773 .as_str()
1774 .or_else(|| delta["thought"].as_str())
1775 {
1776 if !reasoning.is_empty() {
1777 past_think = false; content_buffer.push_str(reasoning);
1779 if content_buffer.len() > 30
1780 && (reasoning.contains('\n') || reasoning.contains('.'))
1781 {
1782 let _ = tx
1783 .send(InferenceEvent::Thought(content_buffer.clone()))
1784 .await;
1785 emitted_any_content = true;
1786 content_buffer.clear();
1787 }
1788 }
1789 }
1790
1791 if let Some(content) = delta["content"].as_str() {
1793 if content.is_empty() {
1794 continue;
1795 }
1796
1797 if !past_think && !content_buffer.is_empty() && !content.trim().is_empty() {
1800 let _ = tx
1803 .send(InferenceEvent::Thought(content_buffer.clone()))
1804 .await;
1805 content_buffer.clear();
1806 past_think = true;
1807 }
1808
1809 if !past_think {
1810 let lc = content.to_lowercase();
1811 let close = lc
1812 .find("<channel|>")
1813 .map(|i| (i, "<channel|>".len()))
1814 .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1815
1816 if let Some((tag_start, tag_len)) = close {
1817 let before = &content[..tag_start];
1819 content_buffer.push_str(before);
1820 if !content_buffer.trim().is_empty() {
1821 let _ = tx
1822 .send(InferenceEvent::Thought(content_buffer.clone()))
1823 .await;
1824 emitted_any_content = true;
1825 }
1826 content_buffer.clear();
1827
1828 past_think = true;
1829 let after = content[tag_start + tag_len..].trim_start_matches('\n');
1830 content_buffer.push_str(after);
1831 } else {
1832 content_buffer.push_str(content);
1834 if content_buffer.len() > 30
1835 && (content.contains('\n') || content.contains('.'))
1836 {
1837 let _ = tx
1838 .send(InferenceEvent::Thought(content_buffer.clone()))
1839 .await;
1840 emitted_any_content = true;
1841 content_buffer.clear();
1842 }
1843 }
1844 } else {
1845 content_buffer.push_str(content);
1847 let is_boundary = content.contains(' ')
1848 || content.contains('.')
1849 || content.contains('!')
1850 || content.contains('?');
1851
1852 if content_buffer.len() > 10 && is_boundary {
1853 if !emitted_live_status {
1854 let _ = tx
1855 .send(InferenceEvent::ProviderStatus {
1856 state: ProviderRuntimeState::Live,
1857 summary: String::new(),
1858 })
1859 .await;
1860 emitted_live_status = true;
1861 }
1862 let _ =
1863 tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1864 emitted_any_content = true;
1865 content_buffer.clear();
1866 }
1867 }
1868 }
1869 }
1870 }
1871 }
1872
1873 if !content_buffer.is_empty() {
1875 if past_think {
1876 if !emitted_live_status {
1877 let _ = tx
1878 .send(InferenceEvent::ProviderStatus {
1879 state: ProviderRuntimeState::Live,
1880 summary: String::new(),
1881 })
1882 .await;
1883 }
1884 let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1885 } else {
1886 let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1887 }
1888 emitted_any_content = true;
1889 }
1890
1891 if !emitted_any_content {
1892 let _ = tx
1893 .send(InferenceEvent::ProviderStatus {
1894 state: ProviderRuntimeState::EmptyResponse,
1895 summary: compact_runtime_failure_summary(
1896 "empty_model_response",
1897 "Empty response from model",
1898 ),
1899 })
1900 .await;
1901 let _ = tx
1902 .send(InferenceEvent::Error(format_runtime_failure_message(
1903 "Empty response from model",
1904 )))
1905 .await;
1906 let _ = tx.send(InferenceEvent::Done).await;
1907 return Ok(());
1908 }
1909
1910 let _ = tx.send(InferenceEvent::Done).await;
1911 Ok(())
1912 }
1913
1914 pub async fn stream_generation(
1916 &self,
1917 prompt: &str,
1918 snark: u8,
1919 chaos: u8,
1920 brief: bool,
1921 professional: bool,
1922 tx: mpsc::Sender<InferenceEvent>,
1923 ) -> Result<(), Box<dyn std::error::Error>> {
1924 let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1925 let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1926 self.stream_messages(&messages, tx).await
1927 }
1928
1929 pub async fn generate_task_worker(
1933 &self,
1934 prompt: &str,
1935 professional: bool,
1936 ) -> Result<String, String> {
1937 let current_model = self.current_model();
1938 let model = self
1939 .worker_model
1940 .as_deref()
1941 .unwrap_or(current_model.as_str());
1942 self.generate_task_with_model(prompt, 0.1, professional, model)
1943 .await
1944 }
1945
1946 pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1947 self.generate_task_with_temp(prompt, 0.1, professional)
1948 .await
1949 }
1950
1951 pub async fn generate_task_with_temp(
1952 &self,
1953 prompt: &str,
1954 temp: f32,
1955 professional: bool,
1956 ) -> Result<String, String> {
1957 let current_model = self.current_model();
1958 self.generate_task_with_model(prompt, temp, professional, ¤t_model)
1959 .await
1960 }
1961
1962 pub async fn generate_task_with_model(
1963 &self,
1964 prompt: &str,
1965 temp: f32,
1966 professional: bool,
1967 model: &str,
1968 ) -> Result<String, String> {
1969 let _permit = self
1970 .kv_semaphore
1971 .acquire()
1972 .await
1973 .map_err(|e| e.to_string())?;
1974
1975 let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1976 let request_messages = if should_use_gemma_native_formatting(self, model) {
1977 prepare_gemma_native_messages(&[
1978 ChatMessage::system(&system),
1979 ChatMessage::user(prompt),
1980 ])
1981 } else {
1982 vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1983 };
1984 let request = ChatRequest {
1985 model: model.to_string(),
1986 messages: request_messages,
1987 temperature: temp,
1988 stream: false,
1989 tools: None,
1990 };
1991
1992 preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1993
1994 let res = self
1995 .client
1996 .post(&self.api_url)
1997 .json(&request)
1998 .send()
1999 .await
2000 .map_err(|e| format!("LM Studio request failed: {}", e))?;
2001
2002 let body: ChatResponse = res
2003 .json()
2004 .await
2005 .map_err(|e| format!("Failed to parse response: {}", e))?;
2006
2007 body.choices
2008 .first()
2009 .and_then(|c| c.message.content.clone())
2010 .ok_or_else(|| "Empty response from model".to_string())
2011 }
2012
2013 #[allow(dead_code)]
2017 pub fn snip_history(
2018 &self,
2019 turns: &[ChatMessage],
2020 max_tokens_estimate: usize,
2021 keep_recent: usize,
2022 ) -> Vec<ChatMessage> {
2023 let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
2024 if total_chars / 4 <= max_tokens_estimate {
2025 return turns.to_vec();
2026 }
2027 let keep = keep_recent.min(turns.len());
2028 let mut snipped = vec![turns[0].clone()];
2029 if turns.len() > keep + 1 {
2030 snipped.push(ChatMessage::system(&format!(
2031 "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
2032 turns.len() - keep - 1
2033 )));
2034 snipped.extend_from_slice(&turns[turns.len() - keep..]);
2035 } else {
2036 snipped = turns.to_vec();
2037 }
2038 snipped
2039 }
2040}
2041
2042fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
2043 serde_json::to_vec(value)
2044 .ok()
2045 .map_or(0, |bytes| bytes.len() / 4 + 1)
2046}
2047
2048const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
2049
2050fn estimate_message_tokens(message: &ChatMessage) -> usize {
2051 let content_tokens = match &message.content {
2052 MessageContent::Text(s) => s.len() / 4 + 1,
2053 MessageContent::Parts(parts) => parts
2054 .iter()
2055 .map(|part| match part {
2056 ContentPart::Text { text } => text.len() / 4 + 1,
2057 ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
2060 })
2061 .sum(),
2062 };
2063 let tool_tokens: usize = message
2064 .tool_calls
2065 .iter()
2066 .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
2067 .sum();
2068 content_tokens + tool_tokens + 6
2069}
2070
2071pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
2072 messages.iter().map(estimate_message_tokens).sum()
2073}
2074
2075fn reserved_output_tokens(context_length: usize) -> usize {
2076 let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2077 proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2078}
2079
2080pub fn estimate_prompt_pressure(
2081 messages: &[ChatMessage],
2082 tools: &[ToolDefinition],
2083 context_length: usize,
2084) -> (usize, usize, usize, u8) {
2085 let estimated_input_tokens =
2086 estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2087 let reserved_output = reserved_output_tokens(context_length);
2088 let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2089 let percent = if context_length == 0 {
2090 0
2091 } else {
2092 ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2093 };
2094 (
2095 estimated_input_tokens,
2096 reserved_output,
2097 estimated_total,
2098 percent,
2099 )
2100}
2101
2102fn preflight_chat_request(
2103 model: &str,
2104 messages: &[ChatMessage],
2105 tools: &[ToolDefinition],
2106 context_length: usize,
2107) -> Result<(), String> {
2108 let (estimated_input_tokens, reserved_output, estimated_total, _) =
2109 estimate_prompt_pressure(messages, tools, context_length);
2110
2111 if estimated_total > context_length {
2112 return Err(format!(
2113 "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2114 model, estimated_input_tokens, reserved_output, estimated_total, context_length
2115 ));
2116 }
2117
2118 Ok(())
2119}
2120
/// Collects project instruction files into one prompt section.
///
/// Starting at the current working directory and walking up through at most
/// four directory levels, each of `CLAUDE.md`, `CLAUDE.local.md` and
/// `.hematite/instructions.md` is read when it is readable and non-blank.
/// Files with identical content are included once. Each file contributes at
/// most `MAX_PER_FILE` bytes (longer files get a `...[truncated]` suffix) and
/// the combined contribution is limited to `MAX_TOTAL` bytes.
///
/// Returns an empty string when the working directory is unavailable or no
/// file qualifies; otherwise the text starts with a `# Project Instructions`
/// heading followed by one `--- <path> ---` section per file.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;
    const MAX_LEVELS: usize = 4;
    const CANDIDATES: [&str; 3] = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    let Ok(cwd) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    // Content hashes of files already included, used to skip duplicates.
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;

    let mut dir = cwd;
    for _ in 0..MAX_LEVELS {
        for name in &CANDIDATES {
            let path = dir.join(name);
            // A missing or unreadable file is simply skipped.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back up to a char boundary: slicing in the middle of a
                // multi-byte character would panic.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            // Over budget: give up on this directory's remaining candidates.
            // The walk still continues with the parent directory.
            if total_chars + truncated.len() > MAX_TOTAL {
                break;
            }
            total_chars += truncated.len();
            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
        }

        let Some(parent) = dir.parent().map(|p| p.to_path_buf()) else {
            break;
        };
        dir = parent;
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2184
/// Extracts the reasoning text enclosed by `<|channel>thought` … `<channel|>`.
///
/// Tag matching ignores ASCII case. When the closing tag is missing, everything
/// after the opening tag is taken. Returns `None` when there is no opening tag
/// or the enclosed text is blank; otherwise the trimmed text.
pub fn extract_think_block(text: &str) -> Option<String> {
    const OPEN_TAG: &str = "<|channel>thought";
    const CLOSE_TAG: &str = "<channel|>";

    // ASCII-only folding keeps every byte offset identical to `text`, so the
    // positions found here can be used to slice the original string. Full
    // Unicode lowercasing can change byte lengths and shift those offsets.
    let lower = text.to_ascii_lowercase();

    let content_start = lower.find(OPEN_TAG)? + OPEN_TAG.len();
    let content_end = lower[content_start..]
        .find(CLOSE_TAG)
        .map_or(text.len(), |offset| content_start + offset);

    let content = text[content_start..content_end].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
2207
2208pub fn strip_think_blocks(text: &str) -> String {
2209 let text = {
2213 let t = text.trim_start();
2214 if t.to_lowercase().starts_with("</think>") {
2215 &t[8..]
2216 } else {
2217 text
2218 }
2219 };
2220
2221 let lower = text.to_lowercase();
2222
2223 if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
2225 let answer = text[end..]
2226 .replace("<|channel>thought", "")
2227 .replace("<channel|>", "");
2228 return answer.trim().replace("\n\n\n", "\n\n").to_string();
2229 }
2230
2231 let first_open = [
2233 lower.find("<|channel>thought"), lower.find("<think>"),
2235 lower.find("<thought>"),
2236 lower.find("<|think|>"),
2237 ]
2238 .iter()
2239 .filter_map(|&x| x)
2240 .min();
2241
2242 if let Some(start) = first_open {
2243 if start > 0 {
2244 return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
2245 }
2246 return String::new();
2247 }
2248
2249 let naked_reasoning_phrases: &[&str] = &[
2253 "the user asked",
2254 "the user is asking",
2255 "the user wants",
2256 "i will structure",
2257 "i should provide",
2258 "i should give",
2259 "i should avoid",
2260 "i should note",
2261 "i should focus",
2262 "i should keep",
2263 "i should respond",
2264 "i should present",
2265 "i should display",
2266 "i should show",
2267 "i need to",
2268 "i can see from",
2269 "without being overly",
2270 "let me ",
2271 "necessary information in my identity",
2272 "was computed successfully",
2273 "computed successfully",
2274 ];
2275 let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
2276 if is_naked_reasoning {
2277 let lines: Vec<&str> = text.lines().collect();
2278 if !lines.is_empty() {
2279 let mut start_idx = 0;
2282 for (i, line) in lines.iter().enumerate() {
2283 let l = line.to_lowercase();
2284 let is_reasoning_line =
2285 naked_reasoning_phrases.iter().any(|p| l.contains(p)) || l.trim().is_empty();
2286 if is_reasoning_line {
2287 start_idx = i + 1;
2288 } else {
2289 break;
2290 }
2291 }
2292 if start_idx < lines.len() {
2293 return lines[start_idx..]
2294 .join("\n")
2295 .trim()
2296 .replace("\n\n\n", "\n\n")
2297 .to_string();
2298 }
2299 return String::new();
2301 }
2302 }
2303
2304 let cleaned = strip_xml_tool_call_artifacts(text);
2307 cleaned.trim().replace("\n\n\n", "\n\n").to_string()
2308}
2309
/// Removes stray XML-style tool-call and reasoning tags from model output.
///
/// Every occurrence of each tag in `XML_ARTIFACTS` is deleted, matching ASCII
/// case-insensitively. Text around the tags is left untouched; no trimming
/// happens here. A tag that only appears after an inner tag has been removed
/// (for example `</thi</think>nk>`) is removed as well, because the search
/// restarts after every deletion.
///
/// Matching runs against an ASCII-lowercased shadow copy of the text. ASCII
/// lowercasing never changes byte lengths, so an offset found in the shadow
/// copy is also a valid offset in the real text. Full Unicode lowercasing does
/// not have that property (`İ` grows from two to three bytes), which would
/// shift the offsets and delete the wrong bytes or panic.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    // All tags are ASCII and already lowercase, so they can be searched for
    // directly in the lowercased shadow copy.
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        "</think>",
        "</thought>",
        "</thinking>",
    ];

    let mut out = text.to_string();
    let mut lowered = text.to_ascii_lowercase();
    for tag in XML_ARTIFACTS {
        while let Some(pos) = lowered.find(*tag) {
            let span = pos..pos + tag.len();
            // Delete from both copies so their byte offsets stay aligned.
            lowered.drain(span.clone());
            out.drain(span);
        }
    }
    out
}
2342
2343pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2346 use regex::Regex;
2347 let mut results = Vec::new();
2348
2349 let re_call = Regex::new(
2351 r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2352 ).unwrap();
2353 let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2354
2355 for cap in re_call.captures_iter(text) {
2356 let name = cap[1].to_string();
2357 let args_str = &cap[2];
2358 let mut arguments = serde_json::Map::new();
2359
2360 for arg_cap in re_arg.captures_iter(args_str) {
2361 let key = arg_cap[1].to_string();
2362 let val_raw = arg_cap
2363 .get(2)
2364 .map(|m| m.as_str())
2365 .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2366 .unwrap_or("")
2367 .trim();
2368 let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2369
2370 let val = if normalized_raw == "true" {
2371 Value::Bool(true)
2372 } else if normalized_raw == "false" {
2373 Value::Bool(false)
2374 } else if let Ok(n) = normalized_raw.parse::<i64>() {
2375 Value::Number(n.into())
2376 } else if let Ok(n) = normalized_raw.parse::<u64>() {
2377 Value::Number(n.into())
2378 } else if let Ok(n) = normalized_raw.parse::<f64>() {
2379 serde_json::Number::from_f64(n)
2380 .map(Value::Number)
2381 .unwrap_or(Value::String(normalized_raw.clone()))
2382 } else {
2383 Value::String(normalized_raw)
2384 };
2385
2386 arguments.insert(key, val);
2387 }
2388
2389 results.push(ToolCallResponse {
2390 id: format!("call_{}", rand::random::<u32>()),
2391 call_type: "function".to_string(),
2392 function: ToolCallFn {
2393 name,
2394 arguments: Value::Object(arguments).to_string(),
2395 },
2396 });
2397 }
2398
2399 let re_xml_call = Regex::new(
2401 r#"(?s)<tool_call>\s*<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>)?\s*</tool_call>"#
2402 ).unwrap();
2403 let re_xml_param =
2404 Regex::new(r#"(?s)<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)</parameter>"#).unwrap();
2405
2406 for cap in re_xml_call.captures_iter(text) {
2407 let name = cap[1].to_string();
2408 let body = &cap[2];
2409 let mut arguments = serde_json::Map::new();
2410
2411 for p_cap in re_xml_param.captures_iter(body) {
2412 let key = p_cap[1].to_string();
2413 let val_raw = p_cap[2].trim();
2414 let val = if val_raw == "true" {
2415 Value::Bool(true)
2416 } else if val_raw == "false" {
2417 Value::Bool(false)
2418 } else if let Ok(n) = val_raw.parse::<i64>() {
2419 Value::Number(n.into())
2420 } else if let Ok(n) = val_raw.parse::<u64>() {
2421 Value::Number(n.into())
2422 } else {
2423 Value::String(val_raw.to_string())
2424 };
2425 arguments.insert(key, val);
2426 }
2427
2428 if !arguments.is_empty() || name == "clear_context" {
2429 results.push(ToolCallResponse {
2430 id: format!("call_{}", rand::random::<u32>()),
2431 call_type: "function".to_string(),
2432 function: ToolCallFn {
2433 name,
2434 arguments: Value::Object(arguments).to_string(),
2435 },
2436 });
2437 }
2438 }
2439
2440 results
2441}
2442
2443pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2444 let trimmed = raw.trim();
2445 let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2446
2447 let mut value = match serde_json::from_str::<Value>(&candidate) {
2448 Ok(v) => v,
2449 Err(_) => return candidate,
2450 };
2451 normalize_tool_argument_value(tool_name, &mut value);
2452 value.to_string()
2453}
2454
2455fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2456 match value {
2457 Value::String(s) => *s = normalize_string_arg(s),
2458 Value::Array(items) => {
2459 for item in items {
2460 normalize_tool_argument_value(tool_name, item);
2461 }
2462 }
2463 Value::Object(map) => {
2464 for val in map.values_mut() {
2465 normalize_tool_argument_value(tool_name, val);
2466 }
2467 if tool_name == "grep_files" {
2468 if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2469 *pattern = normalize_regex_pattern(pattern);
2470 }
2471 }
2472 for key in ["path", "extension", "query", "command", "reason"] {
2473 if let Some(Value::String(s)) = map.get_mut(key) {
2474 *s = normalize_string_arg(s);
2475 }
2476 }
2477 }
2478 _ => {}
2479 }
2480}
2481
/// Removes one layer of matching quotes (`"`, `'` or `` ` ``) around `input`.
///
/// Returns `None` unless the input both starts and ends with the same quote
/// character and is at least two characters long. Otherwise the inner text is
/// returned with `\"` turned into `"`, then `\\` turned into `\`, and finally
/// trimmed of surrounding whitespace.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    let quote = input
        .chars()
        .next()
        .filter(|&c| matches!(c, '"' | '\'' | '`'))?;

    // Stripping the suffix from what is left after the prefix also rejects a
    // lone quote character, which cannot be its own closing quote.
    let inner = input.strip_prefix(quote)?.strip_suffix(quote)?;

    let unescaped = inner.replace("\\\"", "\"").replace("\\\\", "\\");
    Some(unescaped.trim().to_string())
}
2495
/// Trims `input` and peels off every layer of matching surrounding quotes.
///
/// While the text starts and ends with the same quote character (`"`, `'` or
/// `` ` ``) and is at least two characters long, that pair is removed and the
/// remainder is trimmed again. Mismatched or lone quotes are kept.
fn normalize_string_arg(input: &str) -> String {
    /// Returns the text between one pair of matching outer quotes, if present.
    fn peel_quotes(s: &str) -> Option<&str> {
        let quote = s
            .chars()
            .next()
            .filter(|&c| matches!(c, '"' | '\'' | '`'))?;
        s.strip_prefix(quote)?.strip_suffix(quote)
    }

    let mut rest = input.trim();
    while let Some(inner) = peel_quotes(rest) {
        rest = inner.trim();
    }
    rest.to_string()
}
2513
2514fn normalize_regex_pattern(input: &str) -> String {
2515 let out = normalize_string_arg(input);
2516 if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2517 out[1..out.len() - 1].to_string()
2518 } else {
2519 out
2520 }
2521}
2522
2523fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2524 let mut system_blocks = Vec::new();
2525 let mut prepared = Vec::new();
2526 let mut seeded = false;
2527
2528 for message in messages {
2529 if message.role == "system" {
2530 let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2531 .trim()
2532 .to_string();
2533 if !cleaned.is_empty() {
2534 system_blocks.push(cleaned);
2535 }
2536 continue;
2537 }
2538
2539 let mut clone = message.clone();
2540 clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2541
2542 if !seeded && message.role == "user" {
2543 let mut merged = String::new();
2544 if !system_blocks.is_empty() {
2545 merged.push_str("System instructions for this turn:\n");
2546 merged.push_str(&system_blocks.join("\n\n"));
2547 merged.push_str("\n\n");
2548 }
2549 merged.push_str(clone.content.as_str());
2550 clone.content = MessageContent::Text(merged);
2551 seeded = true;
2552 }
2553
2554 prepared.push(clone);
2555 }
2556
2557 if !seeded && !system_blocks.is_empty() {
2558 prepared.insert(
2559 0,
2560 ChatMessage::user(&format!(
2561 "System instructions for this turn:\n{}",
2562 system_blocks.join("\n\n")
2563 )),
2564 );
2565 }
2566
2567 prepared
2568}
2569
/// Deletes legacy turn markers from `text` and trims the result.
///
/// The opening markers `<|turn>system`, `<|turn>user`, `<|turn>assistant` and
/// `<|turn>tool` are only removed together with the newline that directly
/// follows them. The closing marker `<turn|>` is removed wherever it occurs.
/// Markers are removed one kind at a time, in the order listed below.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const TURN_MARKERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let mut cleaned = text.to_string();
    for marker in TURN_MARKERS {
        cleaned = cleaned.replace(marker, "");
    }
    cleaned.trim().to_string()
}
2579
2580pub fn strip_native_tool_call_text(text: &str) -> String {
2581 use regex::Regex;
2582 let re_call = Regex::new(
2584 r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2585 ).unwrap();
2586 let re_xml = Regex::new(r#"(?s)<tool_call>\s*<function=.*?>.*?</tool_call>"#).unwrap();
2588 let re_response =
2589 Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2590 .unwrap();
2591 let without_calls = re_call.replace_all(text, "");
2592 let without_xml = re_xml.replace_all(without_calls.as_ref(), "");
2593 re_response
2594 .replace_all(without_xml.as_ref(), "")
2595 .trim()
2596 .to_string()
2597}
2598
#[cfg(test)]
mod tests {
    use super::*;

    /// The generated system prompt must mention the running crate version.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let prompt = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);
        assert!(prompt.contains(crate::HEMATITE_VERSION));
    }

    /// A `call:` tool call opened with `<|tool_call>` and closed with
    /// `<tool_call|>` is parsed, and its markup can be stripped again.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        let transcript = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let parsed = extract_native_tool_calls(transcript);
        let names: Vec<&str> = parsed.iter().map(|c| c.function.name.as_str()).collect();
        assert_eq!(names, ["read_file"]);

        let args: Value = serde_json::from_str(&parsed[0].function.arguments).unwrap();
        assert_eq!(args["limit"].as_i64(), Some(100));
        assert_eq!(args["offset"].as_i64(), Some(100));
        assert_eq!(args["path"].as_str(), Some("src/ui/tui.rs"));

        let visible = strip_native_tool_call_text(transcript);
        for marker in ["<|tool_call", "<tool_call|>"] {
            assert!(!visible.contains(marker));
        }
    }

    /// Tool responses invented by the model are removed together with the
    /// calls, while both calls are still extracted in order.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let transcript = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let parsed = extract_native_tool_calls(transcript);
        let names: Vec<&str> = parsed.iter().map(|c| c.function.name.as_str()).collect();
        assert_eq!(names, ["list_files", "read_file"]);

        let visible = strip_native_tool_call_text(transcript);
        for marker in ["<|tool_call", "<|tool_response", "<tool_response|>"] {
            assert!(!visible.contains(marker));
        }
    }

    /// XML-style tool calls embedded in reasoning text are parsed, with the
    /// parameter bodies trimmed.
    #[test]
    fn extracts_qwen_xml_tool_calls_from_reasoning() {
        let transcript = r#"Based on the project structure, I need to check the binary.
<tool_call>
<function=shell>
<parameter=command>
ls -la hematite.exe
</parameter>
<parameter=reason>
Check if the binary exists
</parameter>
</function>
</tool_call>"#;

        let parsed = extract_native_tool_calls(transcript);
        let names: Vec<&str> = parsed.iter().map(|c| c.function.name.as_str()).collect();
        assert_eq!(names, ["shell"]);

        let args: Value = serde_json::from_str(&parsed[0].function.arguments).unwrap();
        assert_eq!(args["command"].as_str(), Some("ls -la hematite.exe"));
        assert_eq!(args["reason"].as_str(), Some("Check if the binary exists"));

        let visible = strip_native_tool_call_text(transcript);
        for marker in ["<tool_call>", "<function=shell>"] {
            assert!(!visible.contains(marker));
        }
    }
}