1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
7pub struct InferenceEngine {
10 pub client: reqwest::Client,
11 pub api_url: String,
12 pub base_url: String,
15 pub species: String,
16 pub snark: u8,
17 pub kv_semaphore: Semaphore,
18 pub model: std::sync::RwLock<String>,
20 pub context_length: std::sync::atomic::AtomicUsize,
22 pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
23 pub worker_model: Option<String>,
25 pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
27 pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
29}
30
/// Returns `true` when `model` names a Gemma 4 model.
///
/// The match is ASCII-case-insensitive and accepts both the `gemma-4` and the
/// `gemma4` spelling anywhere in the name.
pub fn is_gemma4_model_name(model: &str) -> bool {
    let normalized = model.to_ascii_lowercase();
    ["gemma-4", "gemma4"]
        .iter()
        .any(|marker| normalized.contains(*marker))
}
35
36fn should_use_gemma_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37 is_gemma4_model_name(model) && engine.gemma_native_formatting_enabled()
38}
39
40#[derive(Serialize, Clone, Debug)]
43pub struct ToolDefinition {
44 #[serde(rename = "type")]
45 pub tool_type: String,
46 pub function: ToolFunction,
47 #[serde(skip_serializing, skip_deserializing)]
48 pub metadata: ToolMetadata,
49}
50
51#[derive(Serialize, Clone, Debug)]
52pub struct ToolFunction {
53 pub name: String,
54 pub description: String,
55 pub parameters: Value,
56}
57
/// Coarse grouping assigned to a tool by `tool_metadata_for_name`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ToolCategory {
    /// Tools that read repository files.
    RepoRead,
    /// Tools that write or patch repository files.
    RepoWrite,
    /// Shell and host-level tools.
    Runtime,
    /// Architecture tracing tools.
    Architecture,
    /// Toolchain description tools.
    Toolchain,
    /// Build verification tools.
    Verification,
    /// Git tools.
    Git,
    /// Web and documentation research tools.
    Research,
    /// Vision analysis tools.
    Vision,
    /// Language-server tools.
    Lsp,
    /// Workflow, pinning and task tools.
    Workflow,
    /// Tools whose name starts with `mcp__`.
    External,
    /// Any name without a dedicated entry.
    Other,
}
74
75#[derive(Clone, Copy, Debug, PartialEq, Eq)]
76pub struct ToolMetadata {
77 pub category: ToolCategory,
78 pub mutates_workspace: bool,
79 pub external_surface: bool,
80 pub trust_sensitive: bool,
81 pub read_only_friendly: bool,
82 pub plan_scope: bool,
83}
84
85pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
86 if name.starts_with("mcp__") {
87 let lower = name.to_ascii_lowercase();
88 let mutates_workspace = [
89 "__edit",
90 "__write",
91 "__create",
92 "__move",
93 "__delete",
94 "__remove",
95 "__rename",
96 "__replace",
97 "__patch",
98 ]
99 .iter()
100 .any(|needle| lower.contains(needle));
101 return ToolMetadata {
102 category: ToolCategory::External,
103 mutates_workspace,
104 external_surface: true,
105 trust_sensitive: true,
106 read_only_friendly: !mutates_workspace,
107 plan_scope: false,
108 };
109 }
110
111 match name {
112 "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
113 category: ToolCategory::RepoRead,
114 mutates_workspace: false,
115 external_surface: false,
116 trust_sensitive: false,
117 read_only_friendly: true,
118 plan_scope: true,
119 },
120 "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => ToolMetadata {
121 category: ToolCategory::RepoWrite,
122 mutates_workspace: true,
123 external_surface: false,
124 trust_sensitive: true,
125 read_only_friendly: false,
126 plan_scope: true,
127 },
128 "trace_runtime_flow" => ToolMetadata {
129 category: ToolCategory::Architecture,
130 mutates_workspace: false,
131 external_surface: false,
132 trust_sensitive: false,
133 read_only_friendly: true,
134 plan_scope: false,
135 },
136 "describe_toolchain" => ToolMetadata {
137 category: ToolCategory::Toolchain,
138 mutates_workspace: false,
139 external_surface: false,
140 trust_sensitive: false,
141 read_only_friendly: true,
142 plan_scope: false,
143 },
144 "shell" => ToolMetadata {
145 category: ToolCategory::Runtime,
146 mutates_workspace: true,
147 external_surface: false,
148 trust_sensitive: true,
149 read_only_friendly: false,
150 plan_scope: false,
151 },
152 "inspect_host" => ToolMetadata {
153 category: ToolCategory::Runtime,
154 mutates_workspace: false,
155 external_surface: false,
156 trust_sensitive: false,
157 read_only_friendly: true,
158 plan_scope: false,
159 },
160 "resolve_host_issue" => ToolMetadata {
161 category: ToolCategory::Runtime,
162 mutates_workspace: true,
163 external_surface: true,
164 trust_sensitive: true,
165 read_only_friendly: false,
166 plan_scope: false,
167 },
168 "run_hematite_maintainer_workflow" => ToolMetadata {
169 category: ToolCategory::Workflow,
170 mutates_workspace: true,
171 external_surface: false,
172 trust_sensitive: true,
173 read_only_friendly: false,
174 plan_scope: false,
175 },
176 "run_workspace_workflow" => ToolMetadata {
177 category: ToolCategory::Workflow,
178 mutates_workspace: true,
179 external_surface: false,
180 trust_sensitive: true,
181 read_only_friendly: false,
182 plan_scope: false,
183 },
184 "verify_build" => ToolMetadata {
185 category: ToolCategory::Verification,
186 mutates_workspace: false,
187 external_surface: false,
188 trust_sensitive: false,
189 read_only_friendly: true,
190 plan_scope: false,
191 },
192 "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
193 ToolMetadata {
194 category: ToolCategory::Git,
195 mutates_workspace: true,
196 external_surface: false,
197 trust_sensitive: true,
198 read_only_friendly: false,
199 plan_scope: false,
200 }
201 }
202 "research_web" | "fetch_docs" => ToolMetadata {
203 category: ToolCategory::Research,
204 mutates_workspace: false,
205 external_surface: false,
206 trust_sensitive: false,
207 read_only_friendly: true,
208 plan_scope: false,
209 },
210 "vision_analyze" => ToolMetadata {
211 category: ToolCategory::Vision,
212 mutates_workspace: false,
213 external_surface: false,
214 trust_sensitive: false,
215 read_only_friendly: true,
216 plan_scope: false,
217 },
218 "lsp_definitions"
219 | "lsp_references"
220 | "lsp_hover"
221 | "lsp_rename_symbol"
222 | "lsp_get_diagnostics"
223 | "lsp_search_symbol" => ToolMetadata {
224 category: ToolCategory::Lsp,
225 mutates_workspace: false,
226 external_surface: false,
227 trust_sensitive: false,
228 read_only_friendly: true,
229 plan_scope: false,
230 },
231 "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
232 category: ToolCategory::Workflow,
233 mutates_workspace: false,
234 external_surface: false,
235 trust_sensitive: false,
236 read_only_friendly: true,
237 plan_scope: true,
238 },
239 "manage_tasks" => ToolMetadata {
240 category: ToolCategory::Workflow,
241 mutates_workspace: false,
242 external_surface: false,
243 trust_sensitive: false,
244 read_only_friendly: true,
245 plan_scope: false,
246 },
247 _ => ToolMetadata {
248 category: ToolCategory::Other,
249 mutates_workspace: false,
250 external_surface: false,
251 trust_sensitive: false,
252 read_only_friendly: true,
253 plan_scope: false,
254 },
255 }
256}
257
258#[derive(Serialize, Deserialize, Clone, Debug)]
263pub struct ChatMessage {
264 pub role: String,
265 pub content: MessageContent,
267 #[serde(default, skip_serializing_if = "Vec::is_empty")]
269 pub tool_calls: Vec<ToolCallResponse>,
270 #[serde(skip_serializing_if = "Option::is_none")]
272 pub tool_call_id: Option<String>,
273 #[serde(skip_serializing_if = "Option::is_none")]
275 pub name: Option<String>,
276}
277
278#[derive(Serialize, Deserialize, Clone, Debug)]
279#[serde(untagged)]
280pub enum MessageContent {
281 Text(String),
282 Parts(Vec<ContentPart>),
283}
284
285#[derive(Serialize, Deserialize, Clone, Debug)]
286#[serde(tag = "type")]
287pub enum ContentPart {
288 #[serde(rename = "text")]
289 Text { text: String },
290 #[serde(rename = "image_url")]
291 ImageUrl { image_url: ImageUrlSource },
292}
293
294#[derive(Serialize, Deserialize, Clone, Debug)]
295pub struct ImageUrlSource {
296 pub url: String,
297}
298
299impl Default for MessageContent {
300 fn default() -> Self {
301 MessageContent::Text(String::new())
302 }
303}
304
305impl MessageContent {
306 pub fn as_str(&self) -> &str {
307 match self {
308 MessageContent::Text(s) => s,
309 MessageContent::Parts(parts) => {
310 for part in parts {
311 if let ContentPart::Text { text } = part {
312 return text;
313 }
314 }
315 ""
316 }
317 }
318 }
319}
320
321impl ChatMessage {
322 pub fn system(content: &str) -> Self {
323 Self {
324 role: "system".into(),
325 content: MessageContent::Text(content.into()),
326 tool_calls: Vec::new(),
327 tool_call_id: None,
328 name: None,
329 }
330 }
331 pub fn user(content: &str) -> Self {
332 Self {
333 role: "user".into(),
334 content: MessageContent::Text(content.into()),
335 tool_calls: Vec::new(),
336 tool_call_id: None,
337 name: None,
338 }
339 }
340 pub fn user_with_image(text: &str, image_url: &str) -> Self {
341 let mut text_parts = text.to_string();
342 if !text_parts.contains("<|image|>") {
343 text_parts.push_str(" <|image|>");
344 }
345 Self {
346 role: "user".into(),
347 content: MessageContent::Parts(vec![
348 ContentPart::Text { text: text_parts },
349 ContentPart::ImageUrl {
350 image_url: ImageUrlSource {
351 url: image_url.into(),
352 },
353 },
354 ]),
355 tool_calls: Vec::new(),
356 tool_call_id: None,
357 name: None,
358 }
359 }
360 pub fn assistant_text(content: &str) -> Self {
361 Self {
362 role: "assistant".into(),
363 content: MessageContent::Text(content.into()),
364 tool_calls: Vec::new(),
365 tool_call_id: None,
366 name: None,
367 }
368 }
369 pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
370 Self {
371 role: "assistant".into(),
372 content: MessageContent::Text(content.into()),
373 tool_calls: calls,
374 tool_call_id: None,
375 name: None,
376 }
377 }
378 pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
379 Self::tool_result_for_model(tool_call_id, fn_name, content, "")
380 }
381
382 pub fn tool_result_for_model(
385 tool_call_id: &str,
386 fn_name: &str,
387 content: &str,
388 model: &str,
389 ) -> Self {
390 let body = if is_gemma4_model_name(model) {
391 format!(
392 "<|tool_response>response:{}{}{}<tool_response|>",
393 fn_name, "{", content
394 )
395 } else {
396 content.to_string()
397 };
398 Self {
399 role: "tool".into(),
400 content: MessageContent::Text(body),
401 tool_calls: Vec::new(),
402 tool_call_id: Some(tool_call_id.into()),
403 name: Some(fn_name.into()),
404 }
405 }
406}
407
408#[derive(Serialize, Deserialize, Clone, Debug)]
411pub struct ToolCallResponse {
412 pub id: String,
413 #[serde(rename = "type")]
414 pub call_type: String,
415 pub function: ToolCallFn,
416}
417
418#[derive(Serialize, Deserialize, Clone, Debug)]
419pub struct ToolCallFn {
420 pub name: String,
421 pub arguments: String,
423}
424
425#[derive(Serialize)]
428struct ChatRequest {
429 model: String,
430 messages: Vec<ChatMessage>,
431 temperature: f32,
432 stream: bool,
433 #[serde(skip_serializing_if = "Option::is_none")]
434 tools: Option<Vec<ToolDefinition>>,
435}
436
437#[derive(Deserialize, Debug)]
438struct ChatResponse {
439 choices: Vec<ResponseChoice>,
440 usage: Option<TokenUsage>,
441}
442
443#[derive(Deserialize, Debug, Clone)]
444pub struct TokenUsage {
445 pub prompt_tokens: usize,
446 pub completion_tokens: usize,
447 pub total_tokens: usize,
448 #[serde(default)]
449 pub prompt_cache_hit_tokens: usize,
450 #[serde(default)]
451 pub cache_read_input_tokens: usize,
452}
453
454#[derive(Deserialize, Debug)]
455struct ResponseChoice {
456 message: ResponseMessage,
457 #[serde(default)]
458 finish_reason: Option<String>,
459}
460
461#[derive(Deserialize, Debug)]
462struct ResponseMessage {
463 content: Option<String>,
464 tool_calls: Option<Vec<ToolCallResponse>>,
465 #[serde(default)]
469 reasoning_content: Option<String>,
470}
471
/// Lower bound for the output-token reservation.
/// NOTE(review): the code applying these bounds is outside this section.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1_024;
/// Upper bound for the output-token reservation.
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4_096;
474
/// Returns `true` for context windows of at most 8,192 tokens.
fn is_tiny_context_window(context_length: usize) -> bool {
    const TINY_CEILING: usize = 8_192;
    context_length <= TINY_CEILING
}
478
/// Returns `true` for context windows above 8,192 and up to 49,152 tokens.
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
482
483pub fn is_compact_context_window_pub(context_length: usize) -> bool {
484 is_compact_context_window(context_length)
485}
486
/// Detects provider error text that reports a context-limit problem.
///
/// `lower` must already be lowercased; matching is by plain substring. The
/// text matches when it mentions both `n_keep` and `n_ctx`, or contains any
/// of the fixed phrases below.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    let mentions_keep_and_ctx = lower.contains("n_keep") && lower.contains("n_ctx");
    mentions_keep_and_ctx || PHRASES.iter().any(|phrase| lower.contains(*phrase))
}
494
495fn classify_runtime_failure_tag(detail: &str) -> &'static str {
496 let lower = detail.to_ascii_lowercase();
497 if lower.contains("context_window_blocked")
498 || lower.contains("context ceiling reached")
499 || lower.contains("exceeds the")
500 || is_provider_context_limit_detail(&lower)
501 {
502 "context_window"
503 } else if lower.contains("empty response from model")
504 || lower.contains("model returned an empty response")
505 {
506 "empty_model_response"
507 } else if lower.contains("action blocked:")
508 || lower.contains("access denied")
509 || lower.contains("declined by user")
510 {
511 "tool_policy_blocked"
512 } else {
513 "provider_degraded"
514 }
515}
516
/// Returns the operator guidance sentence for a failure tag.
///
/// Unrecognised tags get the generic retry guidance.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const CONTEXT_WINDOW: &str = "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    const EMPTY_RESPONSE: &str = "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    const POLICY_BLOCKED: &str =
        "Stay inside the allowed workflow or switch modes before retrying.";
    const GENERIC: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";

    if tag == "context_window" {
        CONTEXT_WINDOW
    } else if tag == "empty_model_response" {
        EMPTY_RESPONSE
    } else if tag == "tool_policy_blocked" {
        POLICY_BLOCKED
    } else {
        GENERIC
    }
}
531
532fn format_runtime_failure_message(detail: &str) -> String {
533 let tag = classify_runtime_failure_tag(detail);
534 format!(
535 "[failure:{}] {} Detail: {}",
536 tag,
537 runtime_failure_guidance(tag),
538 detail.trim()
539 )
540}
541
/// Health state reported for the inference provider.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ProviderRuntimeState {
    Booting,
    Live,
    Recovering,
    /// Used for every failure tag without a dedicated state.
    Degraded,
    /// Used for the `context_window` failure tag.
    ContextWindow,
    /// Used for the `empty_model_response` failure tag.
    EmptyResponse,
}
551
/// Health state reported for the MCP integration.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum McpRuntimeState {
    Unconfigured,
    Healthy,
    Degraded,
    Failed,
}
559
/// Checkpoint states surfaced to the operator.
///
/// Each variant has a stable snake_case label available through `label`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum OperatorCheckpointState {
    Idle,
    RecoveringProvider,
    BudgetReduced,
    HistoryCompacted,
    BlockedContextWindow,
    BlockedPolicy,
    BlockedRecentFileEvidence,
    BlockedExactLineWindow,
    BlockedToolLoop,
    BlockedVerification,
}
573
574impl OperatorCheckpointState {
575 pub fn label(self) -> &'static str {
576 match self {
577 OperatorCheckpointState::Idle => "idle",
578 OperatorCheckpointState::RecoveringProvider => "recovering_provider",
579 OperatorCheckpointState::BudgetReduced => "budget_reduced",
580 OperatorCheckpointState::HistoryCompacted => "history_compacted",
581 OperatorCheckpointState::BlockedContextWindow => "blocked_context_window",
582 OperatorCheckpointState::BlockedPolicy => "blocked_policy",
583 OperatorCheckpointState::BlockedRecentFileEvidence => "blocked_recent_file_evidence",
584 OperatorCheckpointState::BlockedExactLineWindow => "blocked_exact_line_window",
585 OperatorCheckpointState::BlockedToolLoop => "blocked_tool_loop",
586 OperatorCheckpointState::BlockedVerification => "blocked_verification",
587 }
588 }
589}
590
591fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
592 match tag {
593 "context_window" => ProviderRuntimeState::ContextWindow,
594 "empty_model_response" => ProviderRuntimeState::EmptyResponse,
595 _ => ProviderRuntimeState::Degraded,
596 }
597}
598
/// Produces a short, single-line summary for a runtime failure.
///
/// The three known tags (`context_window`, `empty_model_response`,
/// `tool_policy_blocked`) map to fixed sentences. Any other tag yields
/// `LM Studio degraded: <excerpt>`, where the excerpt is the first twelve
/// whitespace-separated words of `detail`, cut to at most 110 bytes and
/// suffixed with `...` when cut. An empty excerpt falls back to a fixed
/// sentence.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    const MAX_EXCERPT_WORDS: usize = 12;
    const MAX_EXCERPT_BYTES: usize = 110;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(MAX_EXCERPT_WORDS)
                .collect::<Vec<_>>()
                .join(" ");

            if excerpt.len() > MAX_EXCERPT_BYTES {
                // `String::truncate` panics when the cut lands inside a
                // multi-byte character, so back up to the nearest boundary.
                // Index 0 is always a boundary, so the loop terminates.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }

            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
632
633#[derive(Debug)]
636pub enum InferenceEvent {
637 Token(String),
639 MutedToken(String),
641 Thought(String),
643 VoiceStatus(String),
645 ToolCallStart {
647 id: String,
648 name: String,
649 args: String,
650 },
651 ToolCallResult {
653 id: String,
654 name: String,
655 output: String,
656 is_error: bool,
657 },
658 ApprovalRequired {
662 id: String,
663 name: String,
664 display: String,
665 diff: Option<String>,
668 responder: tokio::sync::oneshot::Sender<bool>,
669 },
670 Done,
672 Error(String),
674 ProviderStatus {
676 state: ProviderRuntimeState,
677 summary: String,
678 },
679 OperatorCheckpoint {
681 state: OperatorCheckpointState,
682 summary: String,
683 },
684 RecoveryRecipe { summary: String },
686 McpStatus {
688 state: McpRuntimeState,
689 summary: String,
690 },
691 CompactionPressure {
693 estimated_tokens: usize,
694 threshold_tokens: usize,
695 percent: u8,
696 },
697 PromptPressure {
699 estimated_input_tokens: usize,
700 reserved_output_tokens: usize,
701 estimated_total_tokens: usize,
702 context_length: usize,
703 percent: u8,
704 },
705 TaskProgress {
707 id: String,
708 label: String,
709 progress: u8,
710 },
711 UsageUpdate(TokenUsage),
713 RuntimeProfile {
715 model_id: String,
716 context_length: usize,
717 },
718 VeinStatus {
720 file_count: usize,
721 embedded_count: usize,
722 docs_only: bool,
723 },
724 VeinContext { paths: Vec<String> },
727 SoulReroll {
729 species: String,
730 rarity: String,
731 shiny: bool,
732 personality: String,
733 },
734 EmbedProfile { model_id: Option<String> },
736 ShellLine(String),
740}
741
742impl InferenceEngine {
745 pub fn new(
746 api_url: String,
747 species: String,
748 snark: u8,
749 ) -> Result<Self, Box<dyn std::error::Error>> {
750 let client = reqwest::Client::builder()
751 .timeout(std::time::Duration::from_secs(180))
752 .build()?;
753
754 let base_url = {
756 let trimmed = api_url.trim_end_matches('/');
757 if let Some(scheme_end) = trimmed.find("://") {
758 let after_scheme = &trimmed[scheme_end + 3..];
759 if let Some(path_start) = after_scheme.find('/') {
760 format!(
761 "{}://{}",
762 &trimmed[..scheme_end],
763 &after_scheme[..path_start]
764 )
765 } else {
766 trimmed.to_string()
767 }
768 } else {
769 trimmed.to_string()
770 }
771 };
772
773 let api_url = if api_url.ends_with("/chat/completions") {
774 api_url
775 } else if api_url.ends_with("/") {
776 format!("{}chat/completions", api_url)
777 } else {
778 format!("{}/chat/completions", api_url)
779 };
780
781 Ok(Self {
782 client,
783 api_url,
784 base_url,
785 species,
786 snark,
787 kv_semaphore: Semaphore::new(3),
788 model: std::sync::RwLock::new(String::new()),
789 context_length: std::sync::atomic::AtomicUsize::new(32_768), economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
791 worker_model: None,
792 gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
793 cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
794 })
795 }
796
797 pub fn set_gemma_native_formatting(&self, enabled: bool) {
798 self.gemma_native_formatting
799 .store(enabled, std::sync::atomic::Ordering::SeqCst);
800 }
801
802 pub fn gemma_native_formatting_enabled(&self) -> bool {
803 self.gemma_native_formatting
804 .load(std::sync::atomic::Ordering::SeqCst)
805 }
806
807 pub fn current_model(&self) -> String {
808 self.model.read().map(|g| g.clone()).unwrap_or_default()
809 }
810
811 pub fn current_context_length(&self) -> usize {
812 self.context_length
813 .load(std::sync::atomic::Ordering::SeqCst)
814 }
815
816 pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
817 if let Ok(mut guard) = self.model.write() {
818 *guard = model.to_string();
819 }
820 self.context_length
821 .store(context_length, std::sync::atomic::Ordering::SeqCst);
822 }
823
824 pub async fn health_check(&self) -> bool {
826 let url = format!("{}/v1/models", self.base_url);
827 match self.client.get(&url).send().await {
828 Ok(resp) => resp.status().is_success(),
829 Err(_) => false,
830 }
831 }
832
833 pub async fn get_loaded_model(&self) -> Option<String> {
841 #[derive(Deserialize)]
842 struct ModelList {
843 data: Vec<ModelEntry>,
844 }
845 #[derive(Deserialize)]
846 struct ModelEntry {
847 id: String,
848 #[serde(rename = "type", default)]
849 model_type: String,
850 #[serde(default)]
851 state: String,
852 }
853
854 if let Ok(resp) = self
856 .client
857 .get(format!("{}/api/v0/models", self.base_url))
858 .send()
859 .await
860 {
861 if let Ok(list) = resp.json::<ModelList>().await {
862 let chat_model = list
863 .data
864 .into_iter()
865 .find(|m| m.model_type != "embeddings" && m.state == "loaded")
866 .map(|m| m.id)
867 .unwrap_or_default();
868 return Some(chat_model);
869 }
870 }
871
872 let resp = self
874 .client
875 .get(format!("{}/v1/models", self.base_url))
876 .send()
877 .await
878 .ok()?;
879 let list: ModelList = resp.json().await.ok()?;
880 Some(
881 list.data
882 .into_iter()
883 .find(|m| !m.id.to_lowercase().contains("embed"))
884 .map(|m| m.id)
885 .unwrap_or_default(),
886 )
887 }
888
889 pub async fn get_embedding_model(&self) -> Option<String> {
895 #[derive(Deserialize)]
896 struct ModelList {
897 data: Vec<ModelEntry>,
898 }
899 #[derive(Deserialize)]
900 struct ModelEntry {
901 id: String,
902 #[serde(rename = "type", default)]
903 model_type: String,
904 #[serde(default)]
905 state: String,
906 }
907 let resp = self
908 .client
909 .get(format!("{}/api/v0/models", self.base_url))
910 .send()
911 .await
912 .ok()?;
913 let list: ModelList = resp.json().await.ok()?;
914 list.data
915 .into_iter()
916 .find(|m| m.model_type == "embeddings" && m.state == "loaded")
917 .map(|m| m.id)
918 }
919
920 pub async fn detect_context_length(&self) -> usize {
926 #[derive(Deserialize)]
927 struct LmStudioModel {
928 id: Option<String>,
929 #[serde(rename = "type", default)]
930 model_type: String,
931 state: Option<String>,
932 loaded_context_length: Option<u64>,
933 context_length: Option<u64>,
934 max_context_length: Option<u64>,
935 }
936 #[derive(Deserialize)]
937 struct LmStudioList {
938 data: Vec<LmStudioModel>,
939 }
940
941 if let Ok(resp) = self
943 .client
944 .get(format!("{}/api/v0/models", self.base_url))
945 .send()
946 .await
947 {
948 if let Ok(list) = resp.json::<LmStudioList>().await {
949 let target_model = self.current_model().to_ascii_lowercase();
950 let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
952 let loaded = list
953 .data
954 .iter()
955 .find(|m| {
956 non_embed(m)
957 && m.state.as_deref() == Some("loaded")
958 && m.id
959 .as_deref()
960 .map(|id| id.eq_ignore_ascii_case(&target_model))
961 .unwrap_or(false)
962 })
963 .or_else(|| {
964 list.data
965 .iter()
966 .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
967 })
968 .or_else(|| {
969 list.data.iter().find(|m| {
970 non_embed(m)
971 && m.id
972 .as_deref()
973 .map(|id| id.eq_ignore_ascii_case(&target_model))
974 .unwrap_or(false)
975 })
976 })
977 .or_else(|| list.data.iter().find(|m| non_embed(m)));
978
979 if let Some(model) = loaded {
980 if let Some(ctx) = model.loaded_context_length {
981 if ctx > 0 {
982 return ctx as usize;
983 }
984 }
985 if let Some(ctx) = model.context_length {
986 if ctx > 0 {
987 return ctx as usize;
988 }
989 }
990 if let Some(ctx) = model.max_context_length {
991 if ctx > 0 && ctx <= 32_768 {
992 return ctx as usize;
993 }
994 }
995 }
996 }
997 }
998
999 if self.current_model().to_lowercase().contains("gemma-4") {
1003 return 32_768;
1004 }
1005
1006 32_768
1007 }
1008
1009 pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
1010 let previous_model = self.current_model();
1011 let previous_context = self.current_context_length();
1012
1013 let detected_model = match self.get_loaded_model().await {
1014 Some(m) if !m.is_empty() => m, Some(_) => "no model loaded".to_string(), None => previous_model.clone(), };
1018
1019 if !detected_model.is_empty() && detected_model != previous_model {
1020 if let Ok(mut guard) = self.model.write() {
1021 *guard = detected_model.clone();
1022 }
1023 }
1024
1025 let detected_context = self.detect_context_length().await;
1026 let effective_model = if detected_model.is_empty() {
1027 previous_model.clone()
1028 } else {
1029 detected_model
1030 };
1031
1032 let changed = effective_model != previous_model || detected_context != previous_context;
1033 self.set_runtime_profile(&effective_model, detected_context);
1034
1035 Some((effective_model, detected_context, changed))
1036 }
1037
1038 pub fn build_system_prompt(
1039 &self,
1040 snark: u8,
1041 chaos: u8,
1042 brief: bool,
1043 professional: bool,
1044 tools: &[ToolDefinition],
1045 reasoning_history: Option<&str>,
1046 mcp_tools: &[crate::agent::mcp::McpTool],
1047 ) -> String {
1048 let mut sys = self.build_system_prompt_legacy(
1049 snark,
1050 chaos,
1051 brief,
1052 professional,
1053 tools,
1054 reasoning_history,
1055 );
1056
1057 if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1058 sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1059 sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1060 for tool in mcp_tools {
1061 let description = tool
1062 .description
1063 .as_deref()
1064 .unwrap_or("No description provided.");
1065 sys.push_str(&format!("- {}: {}\n", tool.name, description));
1066 }
1067 }
1068
1069 sys
1070 }
1071
1072 pub fn build_system_prompt_legacy(
1073 &self,
1074 snark: u8,
1075 _chaos: u8,
1076 brief: bool,
1077 professional: bool,
1078 tools: &[ToolDefinition],
1079 reasoning_history: Option<&str>,
1080 ) -> String {
1081 let current_context_length = self.current_context_length();
1082 if is_tiny_context_window(current_context_length) {
1083 return self.build_system_prompt_tiny(brief, professional);
1084 }
1085 if is_compact_context_window(current_context_length) {
1086 return self.build_system_prompt_compact(brief, professional, tools);
1087 }
1088
1089 let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
1091 - You are Hematite, a local coding system working on the user's machine.\n\
1092 - The running Hematite build is ");
1093 sys.push_str(&crate::hematite_version_display());
1094 sys.push_str(".\n\
1095 - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
1096 - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
1097 - For simple questions, answer briefly in plain language.\n\
1098 - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
1099 - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
1100 - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
1101 - Keep internal reasoning inside channel delimiters.\n\
1102 - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
1103 <turn|>\n\n");
1104
1105 if let Some(history) = reasoning_history {
1106 if !history.is_empty() {
1107 sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
1108 sys.push_str(history);
1109 sys.push_str("\n\n");
1110 }
1111 }
1112
1113 if brief {
1115 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
1116 - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
1117 - Depth: Surface-level verification only.\n\n");
1118 } else {
1119 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
1120 - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
1121 - Depth: Full multi-step derivation required.\n\n");
1122 }
1123
1124 let os = std::env::consts::OS;
1126 if professional {
1127 sys.push_str(&format!(
1128 "You are Hematite, a local coding system running on {}. \
1129 The TUI is one interface layer, not your whole identity. \
1130 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
1131 Skip filler and keep the focus on the work.\n",
1132 os
1133 ));
1134 } else {
1135 sys.push_str(&format!(
1136 "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
1137 The terminal UI is only one surface of the system. \
1138 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
1139 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
1140 self.species, snark, os
1141 ));
1142 }
1143
1144 let current_model = self.current_model();
1146 if !current_model.is_empty() {
1147 sys.push_str(&format!(
1148 "Loaded model: {} | Context window: {} tokens. \
1149 Calibrate response length and tool-call depth to fit within this budget.\n\n",
1150 current_model, current_context_length
1151 ));
1152 if is_gemma4_model_name(¤t_model) {
1153 sys.push_str(
1154 "Gemma 4 native note: prefer exact tool JSON with no extra prose when calling tools. \
1155 Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
1156 For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
1157 );
1158 }
1159 } else {
1160 sys.push_str(&format!(
1161 "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
1162 current_context_length
1163 ));
1164 }
1165
1166 let shell_desc = if cfg!(target_os = "windows") {
1168 "[EXTERNAL SHELL]: `powershell` (Windows).\n\
1169 - Use ONLY for builds, tests, or file migrations. \n\
1170 - You MUST use the `powershell` tool directly. \n\
1171 - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
1172 } else {
1173 "[EXTERNAL SHELL]: `bash` (Unix).\n\
1174 - Use ONLY for builds, tests, or file migrations. \n\
1175 - NEVER wrap bash in other shells. \n\n"
1176 };
1177
1178 sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
1179 [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
1180 - These are the ONLY way to explore and modify code. \n\
1181 - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
1182 sys.push_str(shell_desc);
1183
1184 sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
1186 SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
1187
1188 if brief {
1189 sys.push_str(
1190 "BRIEF MODE: Respond in exactly ONE concise sentence unless providing code.\n\n",
1191 );
1192 }
1193
1194 sys.push_str("## CORE DIRECTIVES\n\
1195 1. REASONING: Your internal reasoning goes in <think>...</think> blocks. Do NOT output reasoning as plain text.\n\
1196 2. CONCISENESS: After <think>, output ONE concise technical sentence or code block. Nothing else.\n\
1197 3. GROUNDEDNESS: Never invent tools, channels, or files. If a detail is not verified from tool output, say `uncertain`. Answer from stable Hematite capabilities unless repo implementation is requested.\n\n\
1198 ## ARCHITECTURAL DISCIPLINE\n\
1199 - HOST INSPECTION PRIORITY: MANDATORY. For all diagnostic questions (load, CPU/RAM, processes, toolchains, network, ports, OS config, log-checks), prefer `inspect_host` over raw `shell`. If `env_doctor` answers, do not follow with `path` unless requested.\n\
1200 - WORKFLOW PRIORITY: Prefer `run_workspace_workflow` for project builds/tests and `run_hematite_maintainer_workflow` for app-level scripts over raw `shell`.\n\
1201 - PROOF BEFORE ACTION: `read_file` or `inspect_lines` before editing. For files >200 lines, `grep_files` for a pattern BEFORE reading.\n\
1202 - PROOF BEFORE COMMIT: Do not `git_commit` until a successful `verify_build` exists for the latest changes.\n\
1203 - BUILT-IN FIRST: Prefer internal file tools over `mcp__filesystem__*` unless MCP is explicitly required.\n\n\
1204 ## TECHNIQUE & SAFETY\n\
1205 - SHELL DISCIPLINE: Risky `shell` calls REQUIRE a `reason` argument. Always use `powershell` on Windows; never `bash` or `/dev/null`.\n\
1206 - EDIT & TOOL PRECISION: Use unique lines/anchors for `edit_file`. Do not call tools like 'think' or 'reasoning'. Paths and symbols are NOT tool names.\n\
1207 - CONTEXT AWARENESS: Answer at the harness level (file ops, shell, build). Prefer real language examples (Python, C#, TS, Go). Never mention `mcp__*` tools unless active and relevant.\n\
1208 - TOOLING DISCIPLINE: Prefer `describe_toolchain` over improvising tool surface from memory; preserve its identifiers exactly.");
1209
1210 sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
1212 2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
1213 3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
1214 4. Fix all errors before declaring success.\n\n\
1215 ## PRE-FLIGHT SCOPING PROTOCOL\n\
1216 Before attempting any multi-file task or complex refactor:\n\
1217 1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
1218 2. Use `auto_pin_context` to keep those files in active context.\n\
1219 3. Only then proceed to deeper edits or research.\n\n\
1220 ## REFACTORING PROTOCOL\n\
1221 When modifying existing code or renaming symbols:\n\
1222 1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
1223 2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
1224 3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
1225
1226 sys.push_str(&load_instruction_files());
1228
1229 sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
1231
1232 if !tools.is_empty() {
1234 sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
1235 for tool in tools {
1236 let schema = serde_json::to_string(&tool.function.parameters)
1237 .unwrap_or_else(|_| "{}".to_string());
1238 sys.push_str(&format!(
1239 "<|tool>declaration:{}{}{}<tool|>\n",
1240 tool.function.name, "{", schema
1241 ));
1242 sys.push_str(&format!("// {})\n", tool.function.description));
1243 }
1244 }
1245
1246 sys
1247 }
1248
1249 fn build_system_prompt_compact(
1250 &self,
1251 brief: bool,
1252 professional: bool,
1253 tools: &[ToolDefinition],
1254 ) -> String {
1255 let current_model = self.current_model();
1258 let current_context_length = self.current_context_length();
1259 let os = std::env::consts::OS;
1260
1261 let mut sys = String::from("<|turn>system\n<|think|>\n");
1262 sys.push_str(&format!(
1263 "You are Hematite {}, a local coding harness working on the user's machine.\n",
1264 crate::hematite_version_display()
1265 ));
1266 if professional {
1267 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1268 } else {
1269 sys.push_str(&format!(
1270 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1271 self.species
1272 ));
1273 }
1274 sys.push_str(&format!(
1275 "Model: {} | Context: {} tokens. Keep turns focused.\n",
1276 current_model, current_context_length
1277 ));
1278 if is_gemma4_model_name(¤t_model) {
1279 sys.push_str(
1280 "Gemma 4: use exact tool JSON. No extra prose in tool calls. \
1281 Raw regex patterns in grep_files, no slash delimiters.\n",
1282 );
1283 }
1284 if cfg!(target_os = "windows") {
1285 sys.push_str(&format!(
1286 "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1287 os
1288 ));
1289 } else {
1290 sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1291 }
1292 if brief {
1293 sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1294 }
1295
1296 sys.push_str(
1297 "\nCORE RULES:\n\
1298 - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1299 - Verify after edits: run `verify_build` after code changes, before committing.\n\
1300 - One tool at a time. Do not batch unrelated tool calls.\n\
1301 - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1302 - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1303 - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1304 );
1305
1306 if !tools.is_empty() {
1307 sys.push_str("\n# AVAILABLE TOOLS\n");
1308 for tool in tools {
1309 let desc: String = tool.function.description.chars().take(120).collect();
1310 sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1311 }
1312 }
1313
1314 sys.push_str("<turn|>\n");
1315 sys
1316 }
1317
1318 fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1319 let current_model = self.current_model();
1320 let current_context_length = self.current_context_length();
1321 let os = std::env::consts::OS;
1322 let mut sys = format!(
1323 "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1324 crate::hematite_version_display()
1325 );
1326 if professional {
1327 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1328 } else {
1329 sys.push_str(&format!(
1330 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1331 self.species
1332 ));
1333 }
1334 if !current_model.is_empty() {
1335 sys.push_str(&format!(
1336 "Loaded model: {} | Context window: {} tokens.\n",
1337 current_model, current_context_length
1338 ));
1339 } else {
1340 sys.push_str(&format!(
1341 "Context window: {} tokens.\n",
1342 current_context_length
1343 ));
1344 }
1345 sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1346 sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1347 sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1348 if cfg!(target_os = "windows") {
1349 sys.push_str(&format!(
1350 "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1351 os
1352 ));
1353 } else {
1354 sys.push_str(&format!(
1355 "You are running on {}. Use the native Unix shell conventions.\n",
1356 os
1357 ));
1358 }
1359 if brief {
1360 sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1361 }
1362 if is_gemma4_model_name(¤t_model) {
1363 sys.push_str(
1364 "Gemma 4 note: use exact tool JSON with no extra prose when calling tools.\n",
1365 );
1366 }
1367 sys.push_str("<turn|>\n");
1368 sys
1369 }
1370
1371 pub async fn call_with_tools(
1376 &self,
1377 messages: &[ChatMessage],
1378 tools: &[ToolDefinition],
1379 model_override: Option<&str>,
1381 ) -> Result<
1382 (
1383 Option<String>,
1384 Option<Vec<ToolCallResponse>>,
1385 Option<TokenUsage>,
1386 Option<String>,
1387 ),
1388 String,
1389 > {
1390 let _permit = self
1391 .kv_semaphore
1392 .acquire()
1393 .await
1394 .map_err(|e| e.to_string())?;
1395
1396 let current_model = self.current_model();
1397 let model = model_override.unwrap_or(current_model.as_str()).to_string();
1398 let filtered_tools = if cfg!(target_os = "windows") {
1399 tools
1400 .iter()
1401 .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1402 .cloned()
1403 .collect::<Vec<_>>()
1404 } else {
1405 tools.to_vec()
1406 };
1407
1408 let request_messages = if should_use_gemma_native_formatting(self, &model) {
1409 prepare_gemma_native_messages(messages)
1410 } else {
1411 messages.to_vec()
1412 };
1413
1414 const COMPACT_CORE_TOOLS: &[&str] = &[
1419 "read_file",
1420 "inspect_lines",
1421 "edit_file",
1422 "write_file",
1423 "grep_files",
1424 "list_files",
1425 "verify_build",
1426 "shell",
1427 ];
1428 let effective_tools = if is_compact_context_window(self.current_context_length()) {
1429 let core: Vec<_> = filtered_tools
1430 .iter()
1431 .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1432 .cloned()
1433 .collect();
1434 if core.is_empty() {
1435 None
1436 } else {
1437 Some(core)
1438 }
1439 } else if filtered_tools.is_empty() {
1440 None
1441 } else {
1442 Some(filtered_tools)
1443 };
1444
1445 let request = ChatRequest {
1446 model: model.clone(),
1447 messages: request_messages,
1448 temperature: 0.2,
1449 stream: false,
1450 tools: effective_tools,
1451 };
1452
1453 preflight_chat_request(
1455 &model,
1456 &request.messages,
1457 request.tools.as_deref().unwrap_or(&[]),
1458 self.current_context_length(),
1459 )?;
1460
1461 let mut last_err = String::new();
1462 let mut response_opt: Option<reqwest::Response> = None;
1463 for attempt in 0..3u32 {
1464 match self.client.post(&self.api_url).json(&request).send().await {
1465 Ok(res) if res.status().is_success() => {
1466 response_opt = Some(res);
1467 break;
1468 }
1469 Ok(res) if res.status().as_u16() >= 500 => {
1470 last_err = format!("LM Studio error {}", res.status());
1471 }
1472 Ok(res) => {
1473 let status = res.status();
1475 let body = res.text().await.unwrap_or_default();
1476 let preview = &body[..body.len().min(300)];
1477 return Err(format!("LM Studio error {}: {}", status, preview));
1478 }
1479 Err(e) if e.is_timeout() || e.is_connect() => {
1480 last_err = format!("Request failed: {}", e);
1481 }
1482 Err(e) => return Err(format!("Request failed: {}", e)),
1483 }
1484 if attempt < 2 {
1485 let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1486 tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1487 }
1488 }
1489 let res = response_opt
1490 .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1491
1492 let body: ChatResponse = res
1493 .json()
1494 .await
1495 .map_err(|e| format!("Response parse error: {}", e))?;
1496
1497 if let Some(usage) = &body.usage {
1498 let mut econ = self.economics.lock().unwrap();
1499 econ.input_tokens += usage.prompt_tokens;
1500 econ.output_tokens += usage.completion_tokens;
1501 }
1502
1503 let choice = body
1504 .choices
1505 .into_iter()
1506 .next()
1507 .ok_or_else(|| "Empty response from model".to_string())?;
1508
1509 let finish_reason = choice.finish_reason;
1510 let mut tool_calls = choice.message.tool_calls;
1511 let mut content = choice.message.content;
1512
1513 if let Some(raw_content) = &content {
1516 let native_calls = extract_native_tool_calls(raw_content);
1517 if !native_calls.is_empty() {
1518 let mut existing = tool_calls.unwrap_or_default();
1519 existing.extend(native_calls);
1520 tool_calls = Some(existing);
1521 let stripped = strip_native_tool_call_text(raw_content);
1522 content = if stripped.trim().is_empty() {
1523 None
1524 } else {
1525 Some(stripped)
1526 };
1527 }
1528 }
1529
1530 if is_gemma4_model_name(&model) {
1531 if let Some(calls) = tool_calls.as_mut() {
1532 for call in calls.iter_mut() {
1533 call.function.arguments = normalize_tool_argument_string(
1534 &call.function.name,
1535 &call.function.arguments,
1536 );
1537 }
1538 }
1539 }
1540
1541 let reasoning_text = choice.message.reasoning_content.unwrap_or_default();
1546 if tool_calls.as_ref().map(|v| v.is_empty()).unwrap_or(true)
1547 && content
1548 .as_ref()
1549 .map(|s| s.trim().is_empty())
1550 .unwrap_or(true)
1551 && !reasoning_text.is_empty()
1552 {
1553 let recovered = extract_native_tool_calls(&reasoning_text);
1554 if !recovered.is_empty() {
1555 tool_calls = Some(recovered);
1556 content = None;
1558 }
1559 }
1560
1561 Ok((content, tool_calls, body.usage, finish_reason))
1562 }
1563
1564 pub async fn stream_messages(
1568 &self,
1569 messages: &[ChatMessage],
1570 tx: mpsc::Sender<InferenceEvent>,
1571 ) -> Result<(), Box<dyn std::error::Error>> {
1572 let current_model = self.current_model();
1573 let request_messages = if should_use_gemma_native_formatting(self, ¤t_model) {
1574 prepare_gemma_native_messages(messages)
1575 } else {
1576 messages
1577 .iter()
1578 .map(|m| {
1579 let mut clone = m.clone();
1580 let current_text = m.content.as_str();
1581 if !current_text.starts_with("<|turn>") {
1582 clone.content = MessageContent::Text(format!(
1583 "<|turn>{}\n{}\n<turn|>",
1584 m.role, current_text
1585 ));
1586 }
1587 clone
1588 })
1589 .collect()
1590 };
1591
1592 let request = ChatRequest {
1593 model: current_model.clone(),
1594 messages: request_messages,
1595 temperature: 0.7,
1596 stream: true,
1597 tools: None,
1598 };
1599
1600 if let Err(e) = preflight_chat_request(
1601 ¤t_model,
1602 &request.messages,
1603 &[],
1604 self.current_context_length(),
1605 ) {
1606 let tag = classify_runtime_failure_tag(&e);
1607 let _ = tx
1608 .send(InferenceEvent::ProviderStatus {
1609 state: provider_state_for_failure_tag(tag),
1610 summary: compact_runtime_failure_summary(tag, &e),
1611 })
1612 .await;
1613 let _ = tx
1614 .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1615 .await;
1616 let _ = tx.send(InferenceEvent::Done).await;
1617 return Ok(());
1618 }
1619
1620 let mut last_err = String::new();
1621 let mut response_opt: Option<reqwest::Response> = None;
1622 for attempt in 0..2u32 {
1623 match self.client.post(&self.api_url).json(&request).send().await {
1624 Ok(res) if res.status().is_success() => {
1625 response_opt = Some(res);
1626 break;
1627 }
1628 Ok(res) if res.status().as_u16() >= 500 => {
1629 last_err = format!("LM Studio error {}", res.status());
1630 }
1631 Ok(res) => {
1632 let status = res.status();
1633 let body = res.text().await.unwrap_or_default();
1634 let preview = &body[..body.len().min(300)];
1635 let detail = format!("LM Studio error {}: {}", status, preview);
1636 let tag = classify_runtime_failure_tag(&detail);
1637 let _ = tx
1638 .send(InferenceEvent::ProviderStatus {
1639 state: provider_state_for_failure_tag(tag),
1640 summary: compact_runtime_failure_summary(tag, &detail),
1641 })
1642 .await;
1643 let _ = tx
1644 .send(InferenceEvent::Error(format_runtime_failure_message(
1645 &detail,
1646 )))
1647 .await;
1648 let _ = tx.send(InferenceEvent::Done).await;
1649 return Ok(());
1650 }
1651 Err(e) if e.is_timeout() || e.is_connect() => {
1652 last_err = format!("Request failed: {}", e);
1653 }
1654 Err(e) => {
1655 let detail = format!("Request failed: {}", e);
1656 let tag = classify_runtime_failure_tag(&detail);
1657 let _ = tx
1658 .send(InferenceEvent::ProviderStatus {
1659 state: provider_state_for_failure_tag(tag),
1660 summary: compact_runtime_failure_summary(tag, &detail),
1661 })
1662 .await;
1663 let _ = tx
1664 .send(InferenceEvent::Error(format_runtime_failure_message(
1665 &detail,
1666 )))
1667 .await;
1668 let _ = tx.send(InferenceEvent::Done).await;
1669 return Ok(());
1670 }
1671 }
1672 if attempt < 1 {
1673 let _ = tx
1674 .send(InferenceEvent::ProviderStatus {
1675 state: ProviderRuntimeState::Recovering,
1676 summary: "LM Studio degraded during stream startup; retrying once.".into(),
1677 })
1678 .await;
1679 tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1680 }
1681 }
1682 let Some(res) = response_opt else {
1683 let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1684 let tag = classify_runtime_failure_tag(&detail);
1685 let _ = tx
1686 .send(InferenceEvent::ProviderStatus {
1687 state: provider_state_for_failure_tag(tag),
1688 summary: compact_runtime_failure_summary(tag, &detail),
1689 })
1690 .await;
1691 let _ = tx
1692 .send(InferenceEvent::Error(format_runtime_failure_message(
1693 &detail,
1694 )))
1695 .await;
1696 let _ = tx.send(InferenceEvent::Done).await;
1697 return Ok(());
1698 };
1699
1700 use futures::StreamExt;
1701 let mut byte_stream = res.bytes_stream();
1702
1703 let mut line_buffer = String::new();
1706 let mut content_buffer = String::new();
1707 let mut past_think = false;
1708 let mut emitted_any_content = false;
1709 let mut emitted_live_status = false;
1710
1711 loop {
1714 let next = tokio::select! {
1715 chunk = byte_stream.next() => chunk,
1717 _ = tokio::time::sleep(std::time::Duration::from_millis(50)) => {
1718 if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1719 break;
1720 }
1721 continue;
1722 }
1723 };
1724
1725 let Some(item) = next else { break };
1726
1727 let chunk = match item {
1728 Ok(chunk) => chunk,
1729 Err(e) => {
1730 let detail = format!("Request failed: {}", e);
1731 let tag = classify_runtime_failure_tag(&detail);
1732 let _ = tx
1733 .send(InferenceEvent::ProviderStatus {
1734 state: provider_state_for_failure_tag(tag),
1735 summary: compact_runtime_failure_summary(tag, &detail),
1736 })
1737 .await;
1738 let _ = tx
1739 .send(InferenceEvent::Error(format_runtime_failure_message(
1740 &detail,
1741 )))
1742 .await;
1743 let _ = tx.send(InferenceEvent::Done).await;
1744 return Ok(());
1745 }
1746 };
1747 line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1748
1749 while let Some(pos) = line_buffer.find("\n\n") {
1750 let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1751 let data_pos = match event_str.find("data: ") {
1752 Some(p) => p,
1753 None => continue,
1754 };
1755
1756 let data = event_str[data_pos + 6..].trim();
1757 if data == "[DONE]" {
1758 break;
1759 }
1760
1761 if let Ok(json) = serde_json::from_str::<Value>(data) {
1762 let delta = &json["choices"][0]["delta"];
1763
1764 if let Some(reasoning) = delta["reasoning_content"]
1766 .as_str()
1767 .or_else(|| delta["thought"].as_str())
1768 {
1769 if !reasoning.is_empty() {
1770 past_think = false; content_buffer.push_str(reasoning);
1772 if content_buffer.len() > 30
1773 && (reasoning.contains('\n') || reasoning.contains('.'))
1774 {
1775 let _ = tx
1776 .send(InferenceEvent::Thought(content_buffer.clone()))
1777 .await;
1778 emitted_any_content = true;
1779 content_buffer.clear();
1780 }
1781 }
1782 }
1783
1784 if let Some(content) = delta["content"].as_str() {
1786 if content.is_empty() {
1787 continue;
1788 }
1789
1790 if !past_think && !content_buffer.is_empty() && !content.trim().is_empty() {
1793 let _ = tx
1796 .send(InferenceEvent::Thought(content_buffer.clone()))
1797 .await;
1798 content_buffer.clear();
1799 past_think = true;
1800 }
1801
1802 if !past_think {
1803 let lc = content.to_lowercase();
1804 let close = lc
1805 .find("<channel|>")
1806 .map(|i| (i, "<channel|>".len()))
1807 .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1808
1809 if let Some((tag_start, tag_len)) = close {
1810 let before = &content[..tag_start];
1812 content_buffer.push_str(before);
1813 if !content_buffer.trim().is_empty() {
1814 let _ = tx
1815 .send(InferenceEvent::Thought(content_buffer.clone()))
1816 .await;
1817 emitted_any_content = true;
1818 }
1819 content_buffer.clear();
1820
1821 past_think = true;
1822 let after = content[tag_start + tag_len..].trim_start_matches('\n');
1823 content_buffer.push_str(after);
1824 } else {
1825 content_buffer.push_str(content);
1827 if content_buffer.len() > 30
1828 && (content.contains('\n') || content.contains('.'))
1829 {
1830 let _ = tx
1831 .send(InferenceEvent::Thought(content_buffer.clone()))
1832 .await;
1833 emitted_any_content = true;
1834 content_buffer.clear();
1835 }
1836 }
1837 } else {
1838 content_buffer.push_str(content);
1840 let is_boundary = content.contains(' ')
1841 || content.contains('.')
1842 || content.contains('!')
1843 || content.contains('?');
1844
1845 if content_buffer.len() > 10 && is_boundary {
1846 if !emitted_live_status {
1847 let _ = tx
1848 .send(InferenceEvent::ProviderStatus {
1849 state: ProviderRuntimeState::Live,
1850 summary: String::new(),
1851 })
1852 .await;
1853 emitted_live_status = true;
1854 }
1855 let _ =
1856 tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1857 emitted_any_content = true;
1858 content_buffer.clear();
1859 }
1860 }
1861 }
1862 }
1863 }
1864 }
1865
1866 if !content_buffer.is_empty() {
1868 if past_think {
1869 if !emitted_live_status {
1870 let _ = tx
1871 .send(InferenceEvent::ProviderStatus {
1872 state: ProviderRuntimeState::Live,
1873 summary: String::new(),
1874 })
1875 .await;
1876 }
1877 let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1878 } else {
1879 let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1880 }
1881 emitted_any_content = true;
1882 }
1883
1884 if !emitted_any_content {
1885 let _ = tx
1886 .send(InferenceEvent::ProviderStatus {
1887 state: ProviderRuntimeState::EmptyResponse,
1888 summary: compact_runtime_failure_summary(
1889 "empty_model_response",
1890 "Empty response from model",
1891 ),
1892 })
1893 .await;
1894 let _ = tx
1895 .send(InferenceEvent::Error(format_runtime_failure_message(
1896 "Empty response from model",
1897 )))
1898 .await;
1899 let _ = tx.send(InferenceEvent::Done).await;
1900 return Ok(());
1901 }
1902
1903 let _ = tx.send(InferenceEvent::Done).await;
1904 Ok(())
1905 }
1906
1907 pub async fn stream_generation(
1909 &self,
1910 prompt: &str,
1911 snark: u8,
1912 chaos: u8,
1913 brief: bool,
1914 professional: bool,
1915 tx: mpsc::Sender<InferenceEvent>,
1916 ) -> Result<(), Box<dyn std::error::Error>> {
1917 let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1918 let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1919 self.stream_messages(&messages, tx).await
1920 }
1921
1922 pub async fn generate_task_worker(
1926 &self,
1927 prompt: &str,
1928 professional: bool,
1929 ) -> Result<String, String> {
1930 let current_model = self.current_model();
1931 let model = self
1932 .worker_model
1933 .as_deref()
1934 .unwrap_or(current_model.as_str());
1935 self.generate_task_with_model(prompt, 0.1, professional, model)
1936 .await
1937 }
1938
1939 pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1940 self.generate_task_with_temp(prompt, 0.1, professional)
1941 .await
1942 }
1943
1944 pub async fn generate_task_with_temp(
1945 &self,
1946 prompt: &str,
1947 temp: f32,
1948 professional: bool,
1949 ) -> Result<String, String> {
1950 let current_model = self.current_model();
1951 self.generate_task_with_model(prompt, temp, professional, ¤t_model)
1952 .await
1953 }
1954
1955 pub async fn generate_task_with_model(
1956 &self,
1957 prompt: &str,
1958 temp: f32,
1959 professional: bool,
1960 model: &str,
1961 ) -> Result<String, String> {
1962 let _permit = self
1963 .kv_semaphore
1964 .acquire()
1965 .await
1966 .map_err(|e| e.to_string())?;
1967
1968 let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1969 let request_messages = if should_use_gemma_native_formatting(self, model) {
1970 prepare_gemma_native_messages(&[
1971 ChatMessage::system(&system),
1972 ChatMessage::user(prompt),
1973 ])
1974 } else {
1975 vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1976 };
1977 let request = ChatRequest {
1978 model: model.to_string(),
1979 messages: request_messages,
1980 temperature: temp,
1981 stream: false,
1982 tools: None,
1983 };
1984
1985 preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1986
1987 let res = self
1988 .client
1989 .post(&self.api_url)
1990 .json(&request)
1991 .send()
1992 .await
1993 .map_err(|e| format!("LM Studio request failed: {}", e))?;
1994
1995 let body: ChatResponse = res
1996 .json()
1997 .await
1998 .map_err(|e| format!("Failed to parse response: {}", e))?;
1999
2000 body.choices
2001 .first()
2002 .and_then(|c| c.message.content.clone())
2003 .ok_or_else(|| "Empty response from model".to_string())
2004 }
2005
2006 #[allow(dead_code)]
2010 pub fn snip_history(
2011 &self,
2012 turns: &[ChatMessage],
2013 max_tokens_estimate: usize,
2014 keep_recent: usize,
2015 ) -> Vec<ChatMessage> {
2016 let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
2017 if total_chars / 4 <= max_tokens_estimate {
2018 return turns.to_vec();
2019 }
2020 let keep = keep_recent.min(turns.len());
2021 let mut snipped = vec![turns[0].clone()];
2022 if turns.len() > keep + 1 {
2023 snipped.push(ChatMessage::system(&format!(
2024 "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
2025 turns.len() - keep - 1
2026 )));
2027 snipped.extend_from_slice(&turns[turns.len() - keep..]);
2028 } else {
2029 snipped = turns.to_vec();
2030 }
2031 snipped
2032 }
2033}
2034
2035fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
2036 serde_json::to_vec(value)
2037 .ok()
2038 .map_or(0, |bytes| bytes.len() / 4 + 1)
2039}
2040
2041const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
2042
2043fn estimate_message_tokens(message: &ChatMessage) -> usize {
2044 let content_tokens = match &message.content {
2045 MessageContent::Text(s) => s.len() / 4 + 1,
2046 MessageContent::Parts(parts) => parts
2047 .iter()
2048 .map(|part| match part {
2049 ContentPart::Text { text } => text.len() / 4 + 1,
2050 ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
2053 })
2054 .sum(),
2055 };
2056 let tool_tokens: usize = message
2057 .tool_calls
2058 .iter()
2059 .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
2060 .sum();
2061 content_tokens + tool_tokens + 6
2062}
2063
2064pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
2065 messages.iter().map(estimate_message_tokens).sum()
2066}
2067
2068fn reserved_output_tokens(context_length: usize) -> usize {
2069 let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2070 proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2071}
2072
2073pub fn estimate_prompt_pressure(
2074 messages: &[ChatMessage],
2075 tools: &[ToolDefinition],
2076 context_length: usize,
2077) -> (usize, usize, usize, u8) {
2078 let estimated_input_tokens =
2079 estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2080 let reserved_output = reserved_output_tokens(context_length);
2081 let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2082 let percent = if context_length == 0 {
2083 0
2084 } else {
2085 ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2086 };
2087 (
2088 estimated_input_tokens,
2089 reserved_output,
2090 estimated_total,
2091 percent,
2092 )
2093}
2094
2095fn preflight_chat_request(
2096 model: &str,
2097 messages: &[ChatMessage],
2098 tools: &[ToolDefinition],
2099 context_length: usize,
2100) -> Result<(), String> {
2101 let (estimated_input_tokens, reserved_output, estimated_total, _) =
2102 estimate_prompt_pressure(messages, tools, context_length);
2103
2104 if estimated_total > context_length {
2105 return Err(format!(
2106 "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2107 model, estimated_input_tokens, reserved_output, estimated_total, context_length
2108 ));
2109 }
2110
2111 Ok(())
2112}
2113
/// Collects project instruction files into one prompt section.
///
/// Starting at the current working directory and walking up at most three
/// parent directories, looks for `CLAUDE.md`, `CLAUDE.local.md` and
/// `.hematite/instructions.md`. Unreadable, empty, and duplicate (same content
/// hash) files are skipped. Each file is cut to `MAX_PER_FILE` bytes and the
/// combined size is limited to `MAX_TOTAL` bytes.
///
/// # Returns
/// A section starting with `# Project Instructions`, or an empty string when
/// the working directory is unavailable or no usable file was found.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;

    let Ok(cwd) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    // Content hashes already included, so identical files are added once.
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;

    let candidates = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    let mut dir = cwd;
    for _ in 0..4 {
        for name in &candidates {
            let path = dir.join(name);
            // A missing file simply fails to read; no separate existence check.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back off to a char boundary: slicing in the middle of a
                // multi-byte UTF-8 sequence would panic.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            // Budget exhausted for this directory; move on to the parent.
            if total_chars + truncated.len() > MAX_TOTAL {
                break;
            }
            total_chars += truncated.len();
            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
        }
        match dir.parent().map(|p| p.to_path_buf()) {
            Some(parent) => dir = parent,
            None => break,
        }
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2177
/// Extracts the reasoning text between `<|channel>thought` and `<channel|>`.
///
/// Tag matching ignores ASCII case. When the closing tag is missing, everything
/// after the opening tag is taken.
///
/// # Returns
/// The trimmed reasoning text, or `None` when there is no opening tag or the
/// text between the tags is empty after trimming.
pub fn extract_think_block(text: &str) -> Option<String> {
    const OPEN_TAG: &str = "<|channel>thought";
    const CLOSE_TAG: &str = "<channel|>";

    // ASCII-only lowering keeps every byte offset identical to `text`, so
    // positions found in `lower` are valid slice indices into `text`. Full
    // Unicode lowering can change byte lengths and shift the offsets.
    let lower = text.to_ascii_lowercase();

    let start_pos = lower.find(OPEN_TAG)?;
    let content_start = start_pos + OPEN_TAG.len();

    let close_pos = lower[content_start..]
        .find(CLOSE_TAG)
        .map(|p| content_start + p)
        .unwrap_or(text.len());

    let content = text[content_start..close_pos].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
2200
2201pub fn strip_think_blocks(text: &str) -> String {
2202 let text = {
2206 let t = text.trim_start();
2207 if t.to_lowercase().starts_with("</think>") {
2208 &t[8..]
2209 } else {
2210 text
2211 }
2212 };
2213
2214 let lower = text.to_lowercase();
2215
2216 if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
2218 let answer = text[end..]
2219 .replace("<|channel>thought", "")
2220 .replace("<channel|>", "");
2221 return answer.trim().replace("\n\n\n", "\n\n").to_string();
2222 }
2223
2224 let first_open = [
2226 lower.find("<|channel>thought"), lower.find("<think>"),
2228 lower.find("<thought>"),
2229 lower.find("<|think|>"),
2230 ]
2231 .iter()
2232 .filter_map(|&x| x)
2233 .min();
2234
2235 if let Some(start) = first_open {
2236 if start > 0 {
2237 return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
2238 }
2239 return String::new();
2240 }
2241
2242 let naked_reasoning_phrases: &[&str] = &[
2246 "the user asked",
2247 "the user is asking",
2248 "the user wants",
2249 "i will structure",
2250 "i should provide",
2251 "i should give",
2252 "i should avoid",
2253 "i should note",
2254 "i should focus",
2255 "i should keep",
2256 "i should respond",
2257 "i should present",
2258 "i should display",
2259 "i should show",
2260 "i need to",
2261 "i can see from",
2262 "without being overly",
2263 "let me ",
2264 "necessary information in my identity",
2265 "was computed successfully",
2266 "computed successfully",
2267 ];
2268 let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
2269 if is_naked_reasoning {
2270 let lines: Vec<&str> = text.lines().collect();
2271 if !lines.is_empty() {
2272 let mut start_idx = 0;
2275 for (i, line) in lines.iter().enumerate() {
2276 let l = line.to_lowercase();
2277 let is_reasoning_line =
2278 naked_reasoning_phrases.iter().any(|p| l.contains(p)) || l.trim().is_empty();
2279 if is_reasoning_line {
2280 start_idx = i + 1;
2281 } else {
2282 break;
2283 }
2284 }
2285 if start_idx < lines.len() {
2286 return lines[start_idx..]
2287 .join("\n")
2288 .trim()
2289 .replace("\n\n\n", "\n\n")
2290 .to_string();
2291 }
2292 return String::new();
2294 }
2295 }
2296
2297 let cleaned = strip_xml_tool_call_artifacts(text);
2300 cleaned.trim().replace("\n\n\n", "\n\n").to_string()
2301}
2302
/// Removes leftover XML-style tool-call and think tags from `text`.
///
/// Every occurrence of each tag in `XML_ARTIFACTS` is deleted, ignoring ASCII
/// case. Tags are processed in list order, and the search restarts after each
/// deletion, so a tag formed by joining the text around a removed one is
/// deleted as well. Only the tags themselves are removed; the text between
/// them is kept.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        "</think>",
        "</thought>",
        "</thinking>",
    ];

    let mut out = text.to_string();
    // Lower-cased shadow of `out`. ASCII lowering preserves byte offsets, so a
    // position found here is the same position in `out`; both strings are
    // edited together to stay aligned.
    let mut lower = text.to_ascii_lowercase();
    for tag in XML_ARTIFACTS {
        while let Some(pos) = lower.find(*tag) {
            let span = pos..pos + tag.len();
            out.replace_range(span.clone(), "");
            lower.replace_range(span, "");
        }
    }
    out
}
2335
2336pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2339 use regex::Regex;
2340 let mut results = Vec::new();
2341
2342 let re_call = Regex::new(
2344 r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2345 ).unwrap();
2346 let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2347
2348 for cap in re_call.captures_iter(text) {
2349 let name = cap[1].to_string();
2350 let args_str = &cap[2];
2351 let mut arguments = serde_json::Map::new();
2352
2353 for arg_cap in re_arg.captures_iter(args_str) {
2354 let key = arg_cap[1].to_string();
2355 let val_raw = arg_cap
2356 .get(2)
2357 .map(|m| m.as_str())
2358 .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2359 .unwrap_or("")
2360 .trim();
2361 let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2362
2363 let val = if normalized_raw == "true" {
2364 Value::Bool(true)
2365 } else if normalized_raw == "false" {
2366 Value::Bool(false)
2367 } else if let Ok(n) = normalized_raw.parse::<i64>() {
2368 Value::Number(n.into())
2369 } else if let Ok(n) = normalized_raw.parse::<u64>() {
2370 Value::Number(n.into())
2371 } else if let Ok(n) = normalized_raw.parse::<f64>() {
2372 serde_json::Number::from_f64(n)
2373 .map(Value::Number)
2374 .unwrap_or(Value::String(normalized_raw.clone()))
2375 } else {
2376 Value::String(normalized_raw)
2377 };
2378
2379 arguments.insert(key, val);
2380 }
2381
2382 results.push(ToolCallResponse {
2383 id: format!("call_{}", rand::random::<u32>()),
2384 call_type: "function".to_string(),
2385 function: ToolCallFn {
2386 name,
2387 arguments: Value::Object(arguments).to_string(),
2388 },
2389 });
2390 }
2391
2392 let re_xml_call = Regex::new(
2394 r#"(?s)<tool_call>\s*<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>)?\s*</tool_call>"#
2395 ).unwrap();
2396 let re_xml_param =
2397 Regex::new(r#"(?s)<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)</parameter>"#).unwrap();
2398
2399 for cap in re_xml_call.captures_iter(text) {
2400 let name = cap[1].to_string();
2401 let body = &cap[2];
2402 let mut arguments = serde_json::Map::new();
2403
2404 for p_cap in re_xml_param.captures_iter(body) {
2405 let key = p_cap[1].to_string();
2406 let val_raw = p_cap[2].trim();
2407 let val = if val_raw == "true" {
2408 Value::Bool(true)
2409 } else if val_raw == "false" {
2410 Value::Bool(false)
2411 } else if let Ok(n) = val_raw.parse::<i64>() {
2412 Value::Number(n.into())
2413 } else if let Ok(n) = val_raw.parse::<u64>() {
2414 Value::Number(n.into())
2415 } else {
2416 Value::String(val_raw.to_string())
2417 };
2418 arguments.insert(key, val);
2419 }
2420
2421 if !arguments.is_empty() || name == "clear_context" {
2422 results.push(ToolCallResponse {
2423 id: format!("call_{}", rand::random::<u32>()),
2424 call_type: "function".to_string(),
2425 function: ToolCallFn {
2426 name,
2427 arguments: Value::Object(arguments).to_string(),
2428 },
2429 });
2430 }
2431 }
2432
2433 results
2434}
2435
2436pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2437 let trimmed = raw.trim();
2438 let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2439
2440 let mut value = match serde_json::from_str::<Value>(&candidate) {
2441 Ok(v) => v,
2442 Err(_) => return candidate,
2443 };
2444 normalize_tool_argument_value(tool_name, &mut value);
2445 value.to_string()
2446}
2447
2448fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2449 match value {
2450 Value::String(s) => *s = normalize_string_arg(s),
2451 Value::Array(items) => {
2452 for item in items {
2453 normalize_tool_argument_value(tool_name, item);
2454 }
2455 }
2456 Value::Object(map) => {
2457 for val in map.values_mut() {
2458 normalize_tool_argument_value(tool_name, val);
2459 }
2460 if tool_name == "grep_files" {
2461 if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2462 *pattern = normalize_regex_pattern(pattern);
2463 }
2464 }
2465 for key in ["path", "extension", "query", "command", "reason"] {
2466 if let Some(Value::String(s)) = map.get_mut(key) {
2467 *s = normalize_string_arg(s);
2468 }
2469 }
2470 }
2471 _ => {}
2472 }
2473}
2474
/// Removes one layer of matching quotes from `input`, if present.
///
/// Returns `None` when `input` is shorter than two bytes or is not wrapped in
/// a matching pair of `"`, `'` or `` ` `` characters. Otherwise returns the
/// inner text with `\"` replaced by `"`, then `\\` replaced by `\`, and with
/// surrounding whitespace trimmed.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    if input.len() < 2 {
        return None;
    }

    // The opening and closing delimiter must be the same character.
    let inner = ['"', '\'', '`']
        .iter()
        .find_map(|&quote| {
            input
                .strip_prefix(quote)
                .and_then(|rest| rest.strip_suffix(quote))
        })?;

    let without_escaped_quotes = inner.replace("\\\"", "\"");
    let unescaped = without_escaped_quotes.replace("\\\\", "\\");
    Some(unescaped.trim().to_string())
}
2488
/// Trims `input` and peels off every layer of matching wrapping quotes.
///
/// While the text is wrapped in a matching pair of `"`, `'` or `` ` ``
/// characters, that pair is removed and the remainder is trimmed again.
/// A lone quote character or mismatched delimiters are left in place.
fn normalize_string_arg(input: &str) -> String {
    let mut current = input.trim();

    loop {
        let unwrapped = ['"', '\'', '`'].iter().find_map(|&quote| {
            current
                .strip_prefix(quote)
                .and_then(|rest| rest.strip_suffix(quote))
        });

        match unwrapped {
            Some(inner) => current = inner.trim(),
            None => break,
        }
    }

    current.to_string()
}
2506
2507fn normalize_regex_pattern(input: &str) -> String {
2508 let out = normalize_string_arg(input);
2509 if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2510 out[1..out.len() - 1].to_string()
2511 } else {
2512 out
2513 }
2514}
2515
2516fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2517 let mut system_blocks = Vec::new();
2518 let mut prepared = Vec::new();
2519 let mut seeded = false;
2520
2521 for message in messages {
2522 if message.role == "system" {
2523 let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2524 .trim()
2525 .to_string();
2526 if !cleaned.is_empty() {
2527 system_blocks.push(cleaned);
2528 }
2529 continue;
2530 }
2531
2532 let mut clone = message.clone();
2533 clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2534
2535 if !seeded && message.role == "user" {
2536 let mut merged = String::new();
2537 if !system_blocks.is_empty() {
2538 merged.push_str("System instructions for this turn:\n");
2539 merged.push_str(&system_blocks.join("\n\n"));
2540 merged.push_str("\n\n");
2541 }
2542 merged.push_str(clone.content.as_str());
2543 clone.content = MessageContent::Text(merged);
2544 seeded = true;
2545 }
2546
2547 prepared.push(clone);
2548 }
2549
2550 if !seeded && !system_blocks.is_empty() {
2551 prepared.insert(
2552 0,
2553 ChatMessage::user(&format!(
2554 "System instructions for this turn:\n{}",
2555 system_blocks.join("\n\n")
2556 )),
2557 );
2558 }
2559
2560 prepared
2561}
2562
/// Deletes legacy turn-wrapper markers from `text` and trims the result.
///
/// The markers removed are `<|turn>system\n`, `<|turn>user\n`,
/// `<|turn>assistant\n`, `<|turn>tool\n` and `<turn|>`, in that order.
/// An opening marker is only removed together with its trailing newline.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const WRAPPERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let unwrapped = WRAPPERS
        .iter()
        .fold(text.to_string(), |acc, marker| acc.replace(*marker, ""));

    unwrapped.trim().to_string()
}
2572
2573pub fn strip_native_tool_call_text(text: &str) -> String {
2574 use regex::Regex;
2575 let re_call = Regex::new(
2577 r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2578 ).unwrap();
2579 let re_xml = Regex::new(r#"(?s)<tool_call>\s*<function=.*?>.*?</tool_call>"#).unwrap();
2581 let re_response =
2582 Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2583 .unwrap();
2584 let without_calls = re_call.replace_all(text, "");
2585 let without_xml = re_xml.replace_all(without_calls.as_ref(), "");
2586 re_response
2587 .replace_all(without_xml.as_ref(), "")
2588 .trim()
2589 .to_string()
2590}
2591
// Unit tests for system-prompt construction and for extracting/stripping
// tool calls that a model wrote inline in its text output.
#[cfg(test)]
mod tests {
    use super::*;

    /// The generated system prompt must contain `crate::HEMATITE_VERSION`.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let system = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);
        assert!(system.contains(crate::HEMATITE_VERSION));
    }

    /// A native call opened with `<|tool_call>` and closed with the differently
    /// shaped `<tool_call|>` tag is still extracted, its numeric arguments are
    /// parsed as integers, and the escaped quotes around the path are removed.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        let text = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "read_file");

        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args.get("limit").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(args.get("offset").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(
            args.get("path").and_then(|v| v.as_str()),
            Some("src/ui/tui.rs")
        );

        // Neither form of the tool_call tag may survive stripping.
        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<tool_call|>"));
    }

    /// A transcript with two native calls, each followed by a
    /// `<|tool_response>` section: both calls are extracted in order, and
    /// stripping leaves no tool-call or tool-response markers behind.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let text = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "list_files");
        assert_eq!(calls[1].function.name, "read_file");

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<|tool_response"));
        assert!(!stripped.contains("<tool_response|>"));
    }

    /// An XML-style `<tool_call><function=...>` block that follows plain
    /// reasoning text is extracted, with each multi-line parameter value
    /// trimmed of its surrounding whitespace.
    #[test]
    fn extracts_qwen_xml_tool_calls_from_reasoning() {
        let text = r#"Based on the project structure, I need to check the binary.
<tool_call>
<function=shell>
<parameter=command>
ls -la hematite.exe
</parameter>
<parameter=reason>
Check if the binary exists
</parameter>
</function>
</tool_call>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "shell");

        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(
            args.get("command").and_then(|v| v.as_str()),
            Some("ls -la hematite.exe")
        );
        assert_eq!(
            args.get("reason").and_then(|v| v.as_str()),
            Some("Check if the binary exists")
        );

        // The whole XML call block must be gone after stripping.
        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("<function=shell>"));
    }
}