1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
/// Shared state for talking to the model provider over HTTP.
///
/// Constructed by `InferenceEngine::new`; the mutable parts (`model`,
/// `context_length`, the two atomic flags) use interior mutability so they can
/// be updated through a shared reference.
pub struct InferenceEngine {
    /// HTTP client used for every provider request.
    pub client: reqwest::Client,
    /// Chat endpoint URL; `new` ensures it ends in `/chat/completions`.
    pub api_url: String,
    /// Scheme and authority part of `api_url` (path removed); the model
    /// listing endpoints are built from it.
    pub base_url: String,
    /// Species label interpolated into the non-professional system prompt.
    pub species: String,
    /// Snark level supplied at construction.
    pub snark: u8,
    /// Semaphore created with 3 permits.
    /// NOTE(review): presumably bounds concurrent provider requests — confirm at the acquire sites.
    pub kv_semaphore: Semaphore,
    /// Identifier of the currently selected model; empty until a profile is set.
    pub model: std::sync::RwLock<String>,
    /// Context window size in tokens; initialised to 32_768.
    pub context_length: std::sync::atomic::AtomicUsize,
    /// Shared, mutex-protected session economics record.
    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
    /// Optional secondary model name; `new` leaves it as `None`.
    /// NOTE(review): consumers are outside this section — confirm intended use.
    pub worker_model: Option<String>,
    /// Toggle read by `gemma_native_formatting_enabled`; starts as `false`.
    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
    /// Shared boolean flag, initialised to `false`.
    /// NOTE(review): presumably polled to abort in-flight work — confirm against callers.
    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
}
30
/// Reports whether `model` names a Gemma 4 model.
///
/// The comparison ignores ASCII case and accepts either the `gemma-4` or the
/// `gemma4` spelling anywhere inside the name.
pub fn is_gemma4_model_name(model: &str) -> bool {
    let normalized = model.to_ascii_lowercase();
    ["gemma-4", "gemma4"]
        .iter()
        .any(|marker| normalized.contains(*marker))
}
35
36fn should_use_gemma_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37 is_gemma4_model_name(model) && engine.gemma_native_formatting_enabled()
38}
39
/// One tool entry that can be attached to a chat request.
///
/// Only `tool_type` (emitted as `type`) and `function` are serialized;
/// `metadata` is excluded from serde entirely.
#[derive(Serialize, Clone, Debug)]
pub struct ToolDefinition {
    /// Written under the JSON key `type`.
    #[serde(rename = "type")]
    pub tool_type: String,
    /// Name, description and parameters of the callable.
    pub function: ToolFunction,
    /// Local policy metadata; never part of the serialized form.
    #[serde(skip_serializing, skip_deserializing)]
    pub metadata: ToolMetadata,
}
50
/// The callable part of a `ToolDefinition`.
#[derive(Serialize, Clone, Debug)]
pub struct ToolFunction {
    /// Tool name.
    pub name: String,
    /// Human-readable description of the tool.
    pub description: String,
    /// Arbitrary JSON value describing the parameters.
    /// NOTE(review): presumably a JSON Schema object — confirm against the tool registry.
    pub parameters: Value,
}
57
/// Coarse grouping assigned to each tool by `tool_metadata_for_name`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ToolCategory {
    /// File reading, line inspection, grep and listing tools.
    RepoRead,
    /// File write, edit and patch tools.
    RepoWrite,
    /// `shell`, `inspect_host` and `resolve_host_issue`.
    Runtime,
    /// `trace_runtime_flow`.
    Architecture,
    /// `describe_toolchain`.
    Toolchain,
    /// `verify_build`.
    Verification,
    /// The `git_*` tools.
    Git,
    /// `research_web` and `fetch_docs`.
    Research,
    /// `vision_analyze`.
    Vision,
    /// The `lsp_*` tools.
    Lsp,
    /// Workflow runners, context pinning, `clarify` and `manage_tasks`.
    Workflow,
    /// Any tool whose name starts with `mcp__`.
    External,
    /// Fallback for names with no explicit mapping.
    Other,
}
74
/// Static policy flags attached to a tool; produced by `tool_metadata_for_name`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ToolMetadata {
    /// Coarse group the tool belongs to.
    pub category: ToolCategory,
    /// Whether the tool is classified as changing the workspace.
    pub mutates_workspace: bool,
    /// Set for MCP tools and `resolve_host_issue`.
    /// NOTE(review): presumably marks tools that reach outside the local workspace — confirm.
    pub external_surface: bool,
    /// Set for every mutating tool and for all MCP tools.
    /// NOTE(review): presumably gates approval prompts — confirm at the consumer.
    pub trust_sensitive: bool,
    /// Set for tools classified as non-mutating.
    pub read_only_friendly: bool,
    /// Set for repo read/write tools and for `auto_pin_context`, `list_pinned`, `clarify`.
    /// NOTE(review): presumably means "usable while planning" — confirm at the consumer.
    pub plan_scope: bool,
}
84
85pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
86 if name.starts_with("mcp__") {
87 let lower = name.to_ascii_lowercase();
88 let mutates_workspace = [
89 "__edit",
90 "__write",
91 "__create",
92 "__move",
93 "__delete",
94 "__remove",
95 "__rename",
96 "__replace",
97 "__patch",
98 ]
99 .iter()
100 .any(|needle| lower.contains(needle));
101 return ToolMetadata {
102 category: ToolCategory::External,
103 mutates_workspace,
104 external_surface: true,
105 trust_sensitive: true,
106 read_only_friendly: !mutates_workspace,
107 plan_scope: false,
108 };
109 }
110
111 match name {
112 "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
113 category: ToolCategory::RepoRead,
114 mutates_workspace: false,
115 external_surface: false,
116 trust_sensitive: false,
117 read_only_friendly: true,
118 plan_scope: true,
119 },
120 "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => ToolMetadata {
121 category: ToolCategory::RepoWrite,
122 mutates_workspace: true,
123 external_surface: false,
124 trust_sensitive: true,
125 read_only_friendly: false,
126 plan_scope: true,
127 },
128 "trace_runtime_flow" => ToolMetadata {
129 category: ToolCategory::Architecture,
130 mutates_workspace: false,
131 external_surface: false,
132 trust_sensitive: false,
133 read_only_friendly: true,
134 plan_scope: false,
135 },
136 "describe_toolchain" => ToolMetadata {
137 category: ToolCategory::Toolchain,
138 mutates_workspace: false,
139 external_surface: false,
140 trust_sensitive: false,
141 read_only_friendly: true,
142 plan_scope: false,
143 },
144 "shell" => ToolMetadata {
145 category: ToolCategory::Runtime,
146 mutates_workspace: true,
147 external_surface: false,
148 trust_sensitive: true,
149 read_only_friendly: false,
150 plan_scope: false,
151 },
152 "inspect_host" => ToolMetadata {
153 category: ToolCategory::Runtime,
154 mutates_workspace: false,
155 external_surface: false,
156 trust_sensitive: false,
157 read_only_friendly: true,
158 plan_scope: false,
159 },
160 "resolve_host_issue" => ToolMetadata {
161 category: ToolCategory::Runtime,
162 mutates_workspace: true,
163 external_surface: true,
164 trust_sensitive: true,
165 read_only_friendly: false,
166 plan_scope: false,
167 },
168 "run_hematite_maintainer_workflow" => ToolMetadata {
169 category: ToolCategory::Workflow,
170 mutates_workspace: true,
171 external_surface: false,
172 trust_sensitive: true,
173 read_only_friendly: false,
174 plan_scope: false,
175 },
176 "run_workspace_workflow" => ToolMetadata {
177 category: ToolCategory::Workflow,
178 mutates_workspace: true,
179 external_surface: false,
180 trust_sensitive: true,
181 read_only_friendly: false,
182 plan_scope: false,
183 },
184 "verify_build" => ToolMetadata {
185 category: ToolCategory::Verification,
186 mutates_workspace: false,
187 external_surface: false,
188 trust_sensitive: false,
189 read_only_friendly: true,
190 plan_scope: false,
191 },
192 "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
193 ToolMetadata {
194 category: ToolCategory::Git,
195 mutates_workspace: true,
196 external_surface: false,
197 trust_sensitive: true,
198 read_only_friendly: false,
199 plan_scope: false,
200 }
201 }
202 "research_web" | "fetch_docs" => ToolMetadata {
203 category: ToolCategory::Research,
204 mutates_workspace: false,
205 external_surface: false,
206 trust_sensitive: false,
207 read_only_friendly: true,
208 plan_scope: false,
209 },
210 "vision_analyze" => ToolMetadata {
211 category: ToolCategory::Vision,
212 mutates_workspace: false,
213 external_surface: false,
214 trust_sensitive: false,
215 read_only_friendly: true,
216 plan_scope: false,
217 },
218 "lsp_definitions"
219 | "lsp_references"
220 | "lsp_hover"
221 | "lsp_rename_symbol"
222 | "lsp_get_diagnostics"
223 | "lsp_search_symbol" => ToolMetadata {
224 category: ToolCategory::Lsp,
225 mutates_workspace: false,
226 external_surface: false,
227 trust_sensitive: false,
228 read_only_friendly: true,
229 plan_scope: false,
230 },
231 "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
232 category: ToolCategory::Workflow,
233 mutates_workspace: false,
234 external_surface: false,
235 trust_sensitive: false,
236 read_only_friendly: true,
237 plan_scope: true,
238 },
239 "manage_tasks" => ToolMetadata {
240 category: ToolCategory::Workflow,
241 mutates_workspace: false,
242 external_surface: false,
243 trust_sensitive: false,
244 read_only_friendly: true,
245 plan_scope: false,
246 },
247 _ => ToolMetadata {
248 category: ToolCategory::Other,
249 mutates_workspace: false,
250 external_surface: false,
251 trust_sensitive: false,
252 read_only_friendly: true,
253 plan_scope: false,
254 },
255 }
256}
257
/// One message in a chat conversation, in the shape sent to and received from
/// the chat endpoint.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ChatMessage {
    /// Speaker role; the constructors in this file use `system`, `user`,
    /// `assistant` and `tool`.
    pub role: String,
    /// Message body, either plain text or a list of parts.
    pub content: MessageContent,
    /// Tool calls attached to the message; defaults to empty when absent and
    /// is omitted from the output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tool_calls: Vec<ToolCallResponse>,
    /// Id of the tool call this message answers; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
    /// Optional name (the tool-result constructor stores the function name
    /// here); omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
}
277
/// Body of a chat message.
///
/// Untagged for serde: a bare JSON string maps to `Text`, an array of parts
/// maps to `Parts`.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(untagged)]
pub enum MessageContent {
    /// Plain text body.
    Text(String),
    /// Multi-part body (text and/or image references).
    Parts(Vec<ContentPart>),
}
284
/// One element of a multi-part message body, discriminated by the JSON `type` field.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(tag = "type")]
pub enum ContentPart {
    /// Text segment, tagged `"text"`.
    #[serde(rename = "text")]
    Text { text: String },
    /// Image reference, tagged `"image_url"`.
    #[serde(rename = "image_url")]
    ImageUrl { image_url: ImageUrlSource },
}
293
/// Payload of an `image_url` content part.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ImageUrlSource {
    /// Location of the image.
    /// NOTE(review): presumably accepts `data:` URLs as well as remote URLs — confirm with callers.
    pub url: String,
}
298
299impl Default for MessageContent {
300 fn default() -> Self {
301 MessageContent::Text(String::new())
302 }
303}
304
305impl MessageContent {
306 pub fn as_str(&self) -> &str {
307 match self {
308 MessageContent::Text(s) => s,
309 MessageContent::Parts(parts) => {
310 for part in parts {
311 if let ContentPart::Text { text } = part {
312 return text;
313 }
314 }
315 ""
316 }
317 }
318 }
319}
320
321impl ChatMessage {
322 pub fn system(content: &str) -> Self {
323 Self {
324 role: "system".into(),
325 content: MessageContent::Text(content.into()),
326 tool_calls: Vec::new(),
327 tool_call_id: None,
328 name: None,
329 }
330 }
331 pub fn user(content: &str) -> Self {
332 Self {
333 role: "user".into(),
334 content: MessageContent::Text(content.into()),
335 tool_calls: Vec::new(),
336 tool_call_id: None,
337 name: None,
338 }
339 }
340 pub fn user_with_image(text: &str, image_url: &str) -> Self {
341 let mut text_parts = text.to_string();
342 if !text_parts.contains("<|image|>") {
343 text_parts.push_str(" <|image|>");
344 }
345 Self {
346 role: "user".into(),
347 content: MessageContent::Parts(vec![
348 ContentPart::Text { text: text_parts },
349 ContentPart::ImageUrl {
350 image_url: ImageUrlSource {
351 url: image_url.into(),
352 },
353 },
354 ]),
355 tool_calls: Vec::new(),
356 tool_call_id: None,
357 name: None,
358 }
359 }
360 pub fn assistant_text(content: &str) -> Self {
361 Self {
362 role: "assistant".into(),
363 content: MessageContent::Text(content.into()),
364 tool_calls: Vec::new(),
365 tool_call_id: None,
366 name: None,
367 }
368 }
369 pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
370 Self {
371 role: "assistant".into(),
372 content: MessageContent::Text(content.into()),
373 tool_calls: calls,
374 tool_call_id: None,
375 name: None,
376 }
377 }
378 pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
379 Self::tool_result_for_model(tool_call_id, fn_name, content, "")
380 }
381
382 pub fn tool_result_for_model(
385 tool_call_id: &str,
386 fn_name: &str,
387 content: &str,
388 model: &str,
389 ) -> Self {
390 let body = if is_gemma4_model_name(model) {
391 format!(
392 "<|tool_response>response:{}{}{}<tool_response|>",
393 fn_name, "{", content
394 )
395 } else {
396 content.to_string()
397 };
398 Self {
399 role: "tool".into(),
400 content: MessageContent::Text(body),
401 tool_calls: Vec::new(),
402 tool_call_id: Some(tool_call_id.into()),
403 name: Some(fn_name.into()),
404 }
405 }
406}
407
/// A tool call as it appears on a chat message.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallResponse {
    /// Identifier of the call; tool-result messages carry a matching `tool_call_id` field.
    pub id: String,
    /// Read from and written to the JSON key `type`.
    #[serde(rename = "type")]
    pub call_type: String,
    /// Function name and arguments of the call.
    pub function: ToolCallFn,
}
417
/// Function portion of a tool call.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallFn {
    /// Name of the function being called.
    pub name: String,
    /// Arguments carried as a string.
    /// NOTE(review): presumably a JSON-encoded object — confirm at the parse site.
    pub arguments: String,
}
424
/// Request body serialized for the chat endpoint.
#[derive(Serialize)]
struct ChatRequest {
    /// Model identifier.
    model: String,
    /// Conversation messages, in order.
    messages: Vec<ChatMessage>,
    /// Sampling temperature.
    temperature: f32,
    /// Streaming flag sent to the provider.
    stream: bool,
    /// Tool definitions; the key is omitted entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    tools: Option<Vec<ToolDefinition>>,
}
436
/// Response body deserialized from the chat endpoint.
#[derive(Deserialize, Debug)]
struct ChatResponse {
    /// Completion choices returned by the provider.
    choices: Vec<ResponseChoice>,
    /// Token accounting, when the provider includes it.
    usage: Option<TokenUsage>,
}
442
/// Token accounting reported by the provider.
#[derive(Deserialize, Debug, Clone)]
pub struct TokenUsage {
    /// Tokens counted for the prompt.
    pub prompt_tokens: usize,
    /// Tokens counted for the completion.
    pub completion_tokens: usize,
    /// Total reported by the provider.
    pub total_tokens: usize,
    /// Cache-hit token count; 0 when the field is absent.
    #[serde(default)]
    pub prompt_cache_hit_tokens: usize,
    /// Cache-read token count; 0 when the field is absent.
    #[serde(default)]
    pub cache_read_input_tokens: usize,
}
453
/// One completion choice inside a `ChatResponse`.
#[derive(Deserialize, Debug)]
struct ResponseChoice {
    /// The generated message.
    message: ResponseMessage,
    /// Provider-supplied finish reason; `None` when absent.
    #[serde(default)]
    finish_reason: Option<String>,
}
460
/// Message payload of a completion choice.
#[derive(Deserialize, Debug)]
struct ResponseMessage {
    /// Text content, if any.
    content: Option<String>,
    /// Tool calls requested by the model, if any.
    tool_calls: Option<Vec<ToolCallResponse>>,
    /// Optional reasoning text; `None` when the field is absent.
    #[serde(default)]
    reasoning_content: Option<String>,
}
471
/// Lower bound on the output-token reservation.
/// NOTE(review): the code applying this bound is outside this section — confirm usage.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1024;
/// Upper bound on the output-token reservation.
/// NOTE(review): the code applying this bound is outside this section — confirm usage.
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4096;
474
/// A context window counts as "tiny" when it holds at most 8_192 tokens.
fn is_tiny_context_window(context_length: usize) -> bool {
    matches!(context_length, 0..=8_192)
}
478
/// A context window counts as "compact" when it is larger than 8_192 tokens
/// but no larger than 49_152 tokens.
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
482
/// Public entry point for the compact-window check; forwards to
/// `is_compact_context_window` without altering the argument or result.
pub fn is_compact_context_window_pub(context_length: usize) -> bool {
    is_compact_context_window(context_length)
}
486
/// Detects provider error text that describes a context-limit failure.
///
/// `lower` must already be lower-cased; matching is by plain substring. The
/// text matches when it mentions both `n_keep` and `n_ctx`, or contains any of
/// the known context-limit phrases.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const LIMIT_PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    let mentions_keep_and_ctx = lower.contains("n_keep") && lower.contains("n_ctx");
    mentions_keep_and_ctx || LIMIT_PHRASES.iter().any(|phrase| lower.contains(*phrase))
}
494
495fn classify_runtime_failure_tag(detail: &str) -> &'static str {
496 let lower = detail.to_ascii_lowercase();
497 if lower.contains("context_window_blocked")
498 || lower.contains("context ceiling reached")
499 || lower.contains("exceeds the")
500 || is_provider_context_limit_detail(&lower)
501 {
502 "context_window"
503 } else if lower.contains("empty response from model")
504 || lower.contains("model returned an empty response")
505 {
506 "empty_model_response"
507 } else if lower.contains("action blocked:")
508 || lower.contains("access denied")
509 || lower.contains("declined by user")
510 {
511 "tool_policy_blocked"
512 } else {
513 "provider_degraded"
514 }
515}
516
/// Returns the operator guidance sentence for a failure tag.
///
/// Unrecognised tags receive the generic "retry, then narrow or restart" advice.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    if tag == "context_window" {
        return "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    }
    if tag == "empty_model_response" {
        return "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    }
    if tag == "tool_policy_blocked" {
        return "Stay inside the allowed workflow or switch modes before retrying.";
    }
    "Retry once automatically, then narrow the turn or restart LM Studio if it persists."
}
531
532fn format_runtime_failure_message(detail: &str) -> String {
533 let tag = classify_runtime_failure_tag(detail);
534 format!(
535 "[failure:{}] {} Detail: {}",
536 tag,
537 runtime_failure_guidance(tag),
538 detail.trim()
539 )
540}
541
/// Health state of the model provider, as reported through
/// `InferenceEvent::ProviderStatus`.
///
/// NOTE(review): only `ContextWindow`, `EmptyResponse` and `Degraded` are
/// produced in this section (by `provider_state_for_failure_tag`); the other
/// variants are set elsewhere.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProviderRuntimeState {
    Booting,
    Live,
    Recovering,
    /// Fallback for any failure tag without a dedicated state.
    Degraded,
    /// Failure tagged `context_window`.
    ContextWindow,
    /// Failure tagged `empty_model_response`.
    EmptyResponse,
}
551
/// Health state carried by `InferenceEvent::McpStatus`.
///
/// NOTE(review): the code that selects these states is outside this section.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum McpRuntimeState {
    Unconfigured,
    Healthy,
    Degraded,
    Failed,
}
559
/// Checkpoint states carried by `InferenceEvent::OperatorCheckpoint`.
///
/// Each variant has a stable snake_case label available through `label()`.
/// NOTE(review): the conditions that trigger each state are outside this section.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OperatorCheckpointState {
    Idle,
    RecoveringProvider,
    BudgetReduced,
    HistoryCompacted,
    BlockedContextWindow,
    BlockedPolicy,
    BlockedRecentFileEvidence,
    BlockedExactLineWindow,
    BlockedToolLoop,
    BlockedVerification,
}
573
574impl OperatorCheckpointState {
575 pub fn label(self) -> &'static str {
576 match self {
577 OperatorCheckpointState::Idle => "idle",
578 OperatorCheckpointState::RecoveringProvider => "recovering_provider",
579 OperatorCheckpointState::BudgetReduced => "budget_reduced",
580 OperatorCheckpointState::HistoryCompacted => "history_compacted",
581 OperatorCheckpointState::BlockedContextWindow => "blocked_context_window",
582 OperatorCheckpointState::BlockedPolicy => "blocked_policy",
583 OperatorCheckpointState::BlockedRecentFileEvidence => "blocked_recent_file_evidence",
584 OperatorCheckpointState::BlockedExactLineWindow => "blocked_exact_line_window",
585 OperatorCheckpointState::BlockedToolLoop => "blocked_tool_loop",
586 OperatorCheckpointState::BlockedVerification => "blocked_verification",
587 }
588 }
589}
590
591fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
592 match tag {
593 "context_window" => ProviderRuntimeState::ContextWindow,
594 "empty_model_response" => ProviderRuntimeState::EmptyResponse,
595 _ => ProviderRuntimeState::Degraded,
596 }
597}
598
/// Produces a short, single-line summary of a runtime failure.
///
/// The three known tags (`context_window`, `empty_model_response`,
/// `tool_policy_blocked`) map to fixed sentences. Any other tag yields
/// `LM Studio degraded: <excerpt>`, where the excerpt is the first 12
/// whitespace-separated words of `detail`, capped at 110 bytes and followed by
/// `...` when shortened. An empty excerpt falls back to a fixed sentence.
///
/// The cap is applied on a UTF-8 character boundary, so non-ASCII detail text
/// can never cause a panic.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    // Upper bound, in bytes, on the free-form excerpt.
    const MAX_EXCERPT_BYTES: usize = 110;
    // Number of leading words taken from the detail text.
    const MAX_EXCERPT_WORDS: usize = 12;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(MAX_EXCERPT_WORDS)
                .collect::<Vec<_>>()
                .join(" ");
            if excerpt.len() > MAX_EXCERPT_BYTES {
                // `String::truncate` panics when the cut lands inside a
                // multi-byte character, so back up to the nearest boundary.
                // Index 0 is always a boundary, so the loop terminates.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }
            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
632
/// Events sent from the inference side to whatever consumes them.
///
/// NOTE(review): the emitting and consuming code is outside this section, so
/// the per-variant descriptions below are derived from names and payload types
/// only — confirm against the emit sites before relying on them.
#[derive(Debug)]
pub enum InferenceEvent {
    /// A piece of output text.
    Token(String),
    /// A piece of output text marked as muted.
    MutedToken(String),
    /// A piece of reasoning text.
    Thought(String),
    /// Voice subsystem status text.
    VoiceStatus(String),
    /// A tool call is starting.
    ToolCallStart {
        id: String,
        name: String,
        args: String,
    },
    /// A tool call finished; `is_error` flags a failed call.
    ToolCallResult {
        id: String,
        name: String,
        output: String,
        is_error: bool,
    },
    /// A tool call needs a yes/no decision, delivered through `responder`.
    ApprovalRequired {
        id: String,
        name: String,
        display: String,
        /// Optional diff text accompanying the request.
        diff: Option<String>,
        /// Optional label describing the mutation.
        mutation_label: Option<String>,
        /// One-shot channel on which the boolean decision is sent back.
        responder: tokio::sync::oneshot::Sender<bool>,
    },
    /// The current turn is complete.
    Done,
    /// An error message.
    Error(String),
    /// Provider health changed.
    ProviderStatus {
        state: ProviderRuntimeState,
        summary: String,
    },
    /// Operator checkpoint changed.
    OperatorCheckpoint {
        state: OperatorCheckpointState,
        summary: String,
    },
    /// A recovery recipe summary.
    RecoveryRecipe { summary: String },
    /// MCP health changed.
    McpStatus {
        state: McpRuntimeState,
        summary: String,
    },
    /// Estimated token usage relative to a compaction threshold.
    CompactionPressure {
        estimated_tokens: usize,
        threshold_tokens: usize,
        percent: u8,
    },
    /// Estimated prompt size relative to the context window.
    PromptPressure {
        estimated_input_tokens: usize,
        reserved_output_tokens: usize,
        estimated_total_tokens: usize,
        context_length: usize,
        percent: u8,
    },
    /// Progress update for a labelled task.
    TaskProgress {
        id: String,
        label: String,
        progress: u8,
    },
    /// Token usage reported by the provider.
    UsageUpdate(TokenUsage),
    /// Model id and context length currently in effect.
    RuntimeProfile {
        model_id: String,
        context_length: usize,
    },
    /// Index status counters.
    VeinStatus {
        file_count: usize,
        embedded_count: usize,
        docs_only: bool,
    },
    /// Paths associated with the current context.
    VeinContext { paths: Vec<String> },
    /// Persona attributes after a reroll.
    SoulReroll {
        species: String,
        rarity: String,
        shiny: bool,
        personality: String,
    },
    /// A command string to copy.
    CopyDiveInCommand(String),
    /// Embedding model id, if any.
    EmbedProfile { model_id: Option<String> },
    /// One line of shell output.
    ShellLine(String),
}
745
746impl InferenceEngine {
749 pub fn new(
750 api_url: String,
751 species: String,
752 snark: u8,
753 ) -> Result<Self, Box<dyn std::error::Error>> {
754 let client = reqwest::Client::builder()
755 .timeout(std::time::Duration::from_secs(180))
756 .build()?;
757
758 let base_url = {
760 let trimmed = api_url.trim_end_matches('/');
761 if let Some(scheme_end) = trimmed.find("://") {
762 let after_scheme = &trimmed[scheme_end + 3..];
763 if let Some(path_start) = after_scheme.find('/') {
764 format!(
765 "{}://{}",
766 &trimmed[..scheme_end],
767 &after_scheme[..path_start]
768 )
769 } else {
770 trimmed.to_string()
771 }
772 } else {
773 trimmed.to_string()
774 }
775 };
776
777 let api_url = if api_url.ends_with("/chat/completions") {
778 api_url
779 } else if api_url.ends_with("/") {
780 format!("{}chat/completions", api_url)
781 } else {
782 format!("{}/chat/completions", api_url)
783 };
784
785 Ok(Self {
786 client,
787 api_url,
788 base_url,
789 species,
790 snark,
791 kv_semaphore: Semaphore::new(3),
792 model: std::sync::RwLock::new(String::new()),
793 context_length: std::sync::atomic::AtomicUsize::new(32_768), economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
795 worker_model: None,
796 gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
797 cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
798 })
799 }
800
801 pub fn set_gemma_native_formatting(&self, enabled: bool) {
802 self.gemma_native_formatting
803 .store(enabled, std::sync::atomic::Ordering::SeqCst);
804 }
805
806 pub fn gemma_native_formatting_enabled(&self) -> bool {
807 self.gemma_native_formatting
808 .load(std::sync::atomic::Ordering::SeqCst)
809 }
810
811 pub fn current_model(&self) -> String {
812 self.model.read().map(|g| g.clone()).unwrap_or_default()
813 }
814
815 pub fn current_context_length(&self) -> usize {
816 self.context_length
817 .load(std::sync::atomic::Ordering::SeqCst)
818 }
819
820 pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
821 if let Ok(mut guard) = self.model.write() {
822 *guard = model.to_string();
823 }
824 self.context_length
825 .store(context_length, std::sync::atomic::Ordering::SeqCst);
826 }
827
828 pub async fn health_check(&self) -> bool {
830 let url = format!("{}/v1/models", self.base_url);
831 match self.client.get(&url).send().await {
832 Ok(resp) => resp.status().is_success(),
833 Err(_) => false,
834 }
835 }
836
837 pub async fn get_loaded_model(&self) -> Option<String> {
845 #[derive(Deserialize)]
846 struct ModelList {
847 data: Vec<ModelEntry>,
848 }
849 #[derive(Deserialize)]
850 struct ModelEntry {
851 id: String,
852 #[serde(rename = "type", default)]
853 model_type: String,
854 #[serde(default)]
855 state: String,
856 }
857
858 if let Ok(resp) = self
860 .client
861 .get(format!("{}/api/v0/models", self.base_url))
862 .send()
863 .await
864 {
865 if let Ok(list) = resp.json::<ModelList>().await {
866 let chat_model = list
867 .data
868 .into_iter()
869 .find(|m| m.model_type != "embeddings" && m.state == "loaded")
870 .map(|m| m.id)
871 .unwrap_or_default();
872 return Some(chat_model);
873 }
874 }
875
876 let resp = self
878 .client
879 .get(format!("{}/v1/models", self.base_url))
880 .send()
881 .await
882 .ok()?;
883 let list: ModelList = resp.json().await.ok()?;
884 Some(
885 list.data
886 .into_iter()
887 .find(|m| !m.id.to_lowercase().contains("embed"))
888 .map(|m| m.id)
889 .unwrap_or_default(),
890 )
891 }
892
893 pub async fn get_embedding_model(&self) -> Option<String> {
899 #[derive(Deserialize)]
900 struct ModelList {
901 data: Vec<ModelEntry>,
902 }
903 #[derive(Deserialize)]
904 struct ModelEntry {
905 id: String,
906 #[serde(rename = "type", default)]
907 model_type: String,
908 #[serde(default)]
909 state: String,
910 }
911 let resp = self
912 .client
913 .get(format!("{}/api/v0/models", self.base_url))
914 .send()
915 .await
916 .ok()?;
917 let list: ModelList = resp.json().await.ok()?;
918 list.data
919 .into_iter()
920 .find(|m| m.model_type == "embeddings" && m.state == "loaded")
921 .map(|m| m.id)
922 }
923
924 pub async fn detect_context_length(&self) -> usize {
930 #[derive(Deserialize)]
931 struct LmStudioModel {
932 id: Option<String>,
933 #[serde(rename = "type", default)]
934 model_type: String,
935 state: Option<String>,
936 loaded_context_length: Option<u64>,
937 context_length: Option<u64>,
938 max_context_length: Option<u64>,
939 }
940 #[derive(Deserialize)]
941 struct LmStudioList {
942 data: Vec<LmStudioModel>,
943 }
944
945 if let Ok(resp) = self
947 .client
948 .get(format!("{}/api/v0/models", self.base_url))
949 .send()
950 .await
951 {
952 if let Ok(list) = resp.json::<LmStudioList>().await {
953 let target_model = self.current_model().to_ascii_lowercase();
954 let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
956 let loaded = list
957 .data
958 .iter()
959 .find(|m| {
960 non_embed(m)
961 && m.state.as_deref() == Some("loaded")
962 && m.id
963 .as_deref()
964 .map(|id| id.eq_ignore_ascii_case(&target_model))
965 .unwrap_or(false)
966 })
967 .or_else(|| {
968 list.data
969 .iter()
970 .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
971 })
972 .or_else(|| {
973 list.data.iter().find(|m| {
974 non_embed(m)
975 && m.id
976 .as_deref()
977 .map(|id| id.eq_ignore_ascii_case(&target_model))
978 .unwrap_or(false)
979 })
980 })
981 .or_else(|| list.data.iter().find(|m| non_embed(m)));
982
983 if let Some(model) = loaded {
984 if let Some(ctx) = model.loaded_context_length {
985 if ctx > 0 {
986 return ctx as usize;
987 }
988 }
989 if let Some(ctx) = model.context_length {
990 if ctx > 0 {
991 return ctx as usize;
992 }
993 }
994 if let Some(ctx) = model.max_context_length {
995 if ctx > 0 && ctx <= 32_768 {
996 return ctx as usize;
997 }
998 }
999 }
1000 }
1001 }
1002
1003 if self.current_model().to_lowercase().contains("gemma-4") {
1007 return 32_768;
1008 }
1009
1010 32_768
1011 }
1012
    /// Re-detects the active model and context length and stores them on the engine.
    ///
    /// Returns `(model, context_length, changed)`, where `changed` is `true`
    /// when either value differs from what was stored before the call. As
    /// written, the function always returns `Some`.
    pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
        let previous_model = self.current_model();
        let previous_context = self.current_context_length();

        // A reachable provider with nothing loaded yields the placeholder
        // name; an unreachable provider keeps the previous name.
        let detected_model = match self.get_loaded_model().await {
            Some(m) if !m.is_empty() => m,
            Some(_) => "no model loaded".to_string(),
            None => previous_model.clone(),
        };

        // Store the new name first: `detect_context_length` reads the current
        // model to pick the matching entry.
        if !detected_model.is_empty() && detected_model != previous_model {
            if let Ok(mut guard) = self.model.write() {
                *guard = detected_model.clone();
            }
        }

        let detected_context = self.detect_context_length().await;
        // `detected_model` can only be empty when the previous name was empty too.
        let effective_model = if detected_model.is_empty() {
            previous_model.clone()
        } else {
            detected_model
        };

        let changed = effective_model != previous_model || detected_context != previous_context;
        self.set_runtime_profile(&effective_model, detected_context);

        Some((effective_model, detected_context, changed))
    }
1041
1042 pub fn build_system_prompt(
1043 &self,
1044 snark: u8,
1045 chaos: u8,
1046 brief: bool,
1047 professional: bool,
1048 tools: &[ToolDefinition],
1049 reasoning_history: Option<&str>,
1050 mcp_tools: &[crate::agent::mcp::McpTool],
1051 ) -> String {
1052 let mut sys = self.build_system_prompt_legacy(
1053 snark,
1054 chaos,
1055 brief,
1056 professional,
1057 tools,
1058 reasoning_history,
1059 );
1060
1061 if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1062 sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1063 sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1064 for tool in mcp_tools {
1065 let description = tool
1066 .description
1067 .as_deref()
1068 .unwrap_or("No description provided.");
1069 sys.push_str(&format!("- {}: {}\n", tool.name, description));
1070 }
1071 }
1072
1073 sys
1074 }
1075
1076 pub fn build_system_prompt_legacy(
1077 &self,
1078 snark: u8,
1079 _chaos: u8,
1080 brief: bool,
1081 professional: bool,
1082 tools: &[ToolDefinition],
1083 reasoning_history: Option<&str>,
1084 ) -> String {
1085 let current_context_length = self.current_context_length();
1086 if is_tiny_context_window(current_context_length) {
1087 return self.build_system_prompt_tiny(brief, professional);
1088 }
1089 if is_compact_context_window(current_context_length) {
1090 return self.build_system_prompt_compact(brief, professional, tools);
1091 }
1092
1093 let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
1095 - You are Hematite, a local coding system working on the user's machine.\n\
1096 - The running Hematite build is ");
1097 sys.push_str(&crate::hematite_version_display());
1098 sys.push_str(".\n\
1099 - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
1100 - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
1101 - For simple questions, answer briefly in plain language.\n\
1102 - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
1103 - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
1104 - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
1105 - Keep internal reasoning inside channel delimiters.\n\
1106 - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
1107 <turn|>\n\n");
1108
1109 if let Some(history) = reasoning_history {
1110 if !history.is_empty() {
1111 sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
1112 sys.push_str(history);
1113 sys.push_str("\n\n");
1114 }
1115 }
1116
1117 if brief {
1119 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
1120 - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
1121 - Depth: Surface-level verification only.\n\n");
1122 } else {
1123 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
1124 - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
1125 - Depth: Full multi-step derivation required.\n\n");
1126 }
1127
1128 let os = std::env::consts::OS;
1130 if professional {
1131 sys.push_str(&format!(
1132 "You are Hematite, a local coding system running on {}. \
1133 The TUI is one interface layer, not your whole identity. \
1134 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
1135 Skip filler and keep the focus on the work.\n",
1136 os
1137 ));
1138 } else {
1139 sys.push_str(&format!(
1140 "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
1141 The terminal UI is only one surface of the system. \
1142 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
1143 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
1144 self.species, snark, os
1145 ));
1146 }
1147
1148 let current_model = self.current_model();
1150 if !current_model.is_empty() {
1151 sys.push_str(&format!(
1152 "Loaded model: {} | Context window: {} tokens. \
1153 Calibrate response length and tool-call depth to fit within this budget.\n\n",
1154 current_model, current_context_length
1155 ));
1156 if is_gemma4_model_name(¤t_model) {
1157 sys.push_str(
1158 "Gemma 4 native note: prefer exact tool JSON with no extra prose when calling tools. \
1159 Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
1160 For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
1161 );
1162 }
1163 } else {
1164 sys.push_str(&format!(
1165 "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
1166 current_context_length
1167 ));
1168 }
1169
1170 let shell_desc = if cfg!(target_os = "windows") {
1172 "[EXTERNAL SHELL]: `powershell` (Windows).\n\
1173 - Use ONLY for builds, tests, or file migrations. \n\
1174 - You MUST use the `powershell` tool directly. \n\
1175 - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
1176 } else {
1177 "[EXTERNAL SHELL]: `bash` (Unix).\n\
1178 - Use ONLY for builds, tests, or file migrations. \n\
1179 - NEVER wrap bash in other shells. \n\n"
1180 };
1181
1182 sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
1183 [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
1184 - These are the ONLY way to explore and modify code. \n\
1185 - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
1186 sys.push_str(shell_desc);
1187
1188 sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
1190 SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
1191
1192 if brief {
1193 sys.push_str(
1194 "BRIEF MODE: Respond in exactly ONE concise sentence unless providing code.\n\n",
1195 );
1196 }
1197
1198 sys.push_str("## CORE DIRECTIVES\n\
1199 1. REASONING: Your internal reasoning goes in <think>...</think> blocks. Do NOT output reasoning as plain text.\n\
1200 2. CONCISENESS: After <think>, output ONE concise technical sentence or code block. Nothing else.\n\
1201 3. GROUNDEDNESS: Never invent tools, channels, or files. If a detail is not verified from tool output, say `uncertain`. Answer from stable Hematite capabilities unless repo implementation is requested.\n\n\
1202 ## ARCHITECTURAL DISCIPLINE\n\
1203 - HOST INSPECTION PRIORITY: MANDATORY. For all diagnostic questions (load, CPU/RAM, processes, toolchains, network, ports, OS config, log-checks), prefer `inspect_host` over raw `shell`. If `env_doctor` answers, do not follow with `path` unless requested.\n\
1204 - WORKFLOW PRIORITY: Prefer `run_workspace_workflow` for project builds/tests and `run_hematite_maintainer_workflow` for app-level scripts over raw `shell`.\n\
1205 - PROOF BEFORE ACTION: `read_file` or `inspect_lines` before editing. For files >200 lines, `grep_files` for a pattern BEFORE reading.\n\
1206 - PROOF BEFORE COMMIT: Do not `git_commit` until a successful `verify_build` exists for the latest changes.\n\
1207 - BUILT-IN FIRST: Prefer internal file tools over `mcp__filesystem__*` unless MCP is explicitly required.\n\n\
1208 ## TECHNIQUE & SAFETY\n\
1209 - SHELL DISCIPLINE: Risky `shell` calls REQUIRE a `reason` argument. Always use `powershell` on Windows; never `bash` or `/dev/null`.\n\
1210 - EDIT & TOOL PRECISION: Use unique lines/anchors for `edit_file`. Do not call tools like 'think' or 'reasoning'. Paths and symbols are NOT tool names.\n\
1211 - CONTEXT AWARENESS: Answer at the harness level (file ops, shell, build). Prefer real language examples (Python, C#, TS, Go). Never mention `mcp__*` tools unless active and relevant.\n\
1212 - TOOLING DISCIPLINE: Prefer `describe_toolchain` over improvising tool surface from memory; preserve its identifiers exactly.");
1213
1214 sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
1216 2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
1217 3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
1218 4. Fix all errors before declaring success.\n\n\
1219 ## PRE-FLIGHT SCOPING PROTOCOL\n\
1220 Before attempting any multi-file task or complex refactor:\n\
1221 1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
1222 2. Use `auto_pin_context` to keep those files in active context.\n\
1223 3. Only then proceed to deeper edits or research.\n\n\
1224 ## REFACTORING PROTOCOL\n\
1225 When modifying existing code or renaming symbols:\n\
1226 1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
1227 2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
1228 3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
1229
1230 sys.push_str(&load_instruction_files());
1232
1233 sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
1235
1236 if !tools.is_empty() {
1238 sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
1239 for tool in tools {
1240 let schema = serde_json::to_string(&tool.function.parameters)
1241 .unwrap_or_else(|_| "{}".to_string());
1242 sys.push_str(&format!(
1243 "<|tool>declaration:{}{}{}<tool|>\n",
1244 tool.function.name, "{", schema
1245 ));
1246 sys.push_str(&format!("// {})\n", tool.function.description));
1247 }
1248 }
1249
1250 sys
1251 }
1252
1253 fn build_system_prompt_compact(
1254 &self,
1255 brief: bool,
1256 professional: bool,
1257 tools: &[ToolDefinition],
1258 ) -> String {
1259 let current_model = self.current_model();
1262 let current_context_length = self.current_context_length();
1263 let os = std::env::consts::OS;
1264
1265 let mut sys = String::from("<|turn>system\n<|think|>\n");
1266 sys.push_str(&format!(
1267 "You are Hematite {}, a local coding harness working on the user's machine.\n",
1268 crate::hematite_version_display()
1269 ));
1270 if professional {
1271 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1272 } else {
1273 sys.push_str(&format!(
1274 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1275 self.species
1276 ));
1277 }
1278 sys.push_str(&format!(
1279 "Model: {} | Context: {} tokens. Keep turns focused.\n",
1280 current_model, current_context_length
1281 ));
1282 if is_gemma4_model_name(¤t_model) {
1283 sys.push_str(
1284 "Gemma 4: use exact tool JSON. No extra prose in tool calls. \
1285 Raw regex patterns in grep_files, no slash delimiters.\n",
1286 );
1287 }
1288 if cfg!(target_os = "windows") {
1289 sys.push_str(&format!(
1290 "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1291 os
1292 ));
1293 } else {
1294 sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1295 }
1296 if brief {
1297 sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1298 }
1299
1300 sys.push_str(
1301 "\nCORE RULES:\n\
1302 - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1303 - Verify after edits: run `verify_build` after code changes, before committing.\n\
1304 - One tool at a time. Do not batch unrelated tool calls.\n\
1305 - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1306 - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1307 - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1308 );
1309
1310 if !tools.is_empty() {
1311 sys.push_str("\n# AVAILABLE TOOLS\n");
1312 for tool in tools {
1313 let desc: String = tool.function.description.chars().take(120).collect();
1314 sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1315 }
1316 }
1317
1318 sys.push_str("<turn|>\n");
1319 sys
1320 }
1321
1322 fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1323 let current_model = self.current_model();
1324 let current_context_length = self.current_context_length();
1325 let os = std::env::consts::OS;
1326 let mut sys = format!(
1327 "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1328 crate::hematite_version_display()
1329 );
1330 if professional {
1331 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1332 } else {
1333 sys.push_str(&format!(
1334 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1335 self.species
1336 ));
1337 }
1338 if !current_model.is_empty() {
1339 sys.push_str(&format!(
1340 "Loaded model: {} | Context window: {} tokens.\n",
1341 current_model, current_context_length
1342 ));
1343 } else {
1344 sys.push_str(&format!(
1345 "Context window: {} tokens.\n",
1346 current_context_length
1347 ));
1348 }
1349 sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1350 sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1351 sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1352 if cfg!(target_os = "windows") {
1353 sys.push_str(&format!(
1354 "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1355 os
1356 ));
1357 } else {
1358 sys.push_str(&format!(
1359 "You are running on {}. Use the native Unix shell conventions.\n",
1360 os
1361 ));
1362 }
1363 if brief {
1364 sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1365 }
1366 if is_gemma4_model_name(¤t_model) {
1367 sys.push_str(
1368 "Gemma 4 note: use exact tool JSON with no extra prose when calling tools.\n",
1369 );
1370 }
1371 sys.push_str("<turn|>\n");
1372 sys
1373 }
1374
1375 pub async fn call_with_tools(
1380 &self,
1381 messages: &[ChatMessage],
1382 tools: &[ToolDefinition],
1383 model_override: Option<&str>,
1385 ) -> Result<
1386 (
1387 Option<String>,
1388 Option<Vec<ToolCallResponse>>,
1389 Option<TokenUsage>,
1390 Option<String>,
1391 ),
1392 String,
1393 > {
1394 let _permit = self
1395 .kv_semaphore
1396 .acquire()
1397 .await
1398 .map_err(|e| e.to_string())?;
1399
1400 let current_model = self.current_model();
1401 let model = model_override.unwrap_or(current_model.as_str()).to_string();
1402 let filtered_tools = if cfg!(target_os = "windows") {
1403 tools
1404 .iter()
1405 .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1406 .cloned()
1407 .collect::<Vec<_>>()
1408 } else {
1409 tools.to_vec()
1410 };
1411
1412 let request_messages = if should_use_gemma_native_formatting(self, &model) {
1413 prepare_gemma_native_messages(messages)
1414 } else {
1415 messages.to_vec()
1416 };
1417
1418 const COMPACT_CORE_TOOLS: &[&str] = &[
1423 "read_file",
1424 "inspect_lines",
1425 "edit_file",
1426 "write_file",
1427 "grep_files",
1428 "list_files",
1429 "verify_build",
1430 "shell",
1431 ];
1432 let effective_tools = if is_compact_context_window(self.current_context_length()) {
1433 let core: Vec<_> = filtered_tools
1434 .iter()
1435 .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1436 .cloned()
1437 .collect();
1438 if core.is_empty() {
1439 None
1440 } else {
1441 Some(core)
1442 }
1443 } else if filtered_tools.is_empty() {
1444 None
1445 } else {
1446 Some(filtered_tools)
1447 };
1448
1449 let request = ChatRequest {
1450 model: model.clone(),
1451 messages: request_messages,
1452 temperature: 0.2,
1453 stream: false,
1454 tools: effective_tools,
1455 };
1456
1457 preflight_chat_request(
1459 &model,
1460 &request.messages,
1461 request.tools.as_deref().unwrap_or(&[]),
1462 self.current_context_length(),
1463 )?;
1464
1465 let mut last_err = String::new();
1466 let mut response_opt: Option<reqwest::Response> = None;
1467 for attempt in 0..3u32 {
1468 match self.client.post(&self.api_url).json(&request).send().await {
1469 Ok(res) if res.status().is_success() => {
1470 response_opt = Some(res);
1471 break;
1472 }
1473 Ok(res) if res.status().as_u16() >= 500 => {
1474 last_err = format!("LM Studio error {}", res.status());
1475 }
1476 Ok(res) => {
1477 let status = res.status();
1479 let body = res.text().await.unwrap_or_default();
1480 let preview = &body[..body.len().min(300)];
1481 return Err(format!("LM Studio error {}: {}", status, preview));
1482 }
1483 Err(e) if e.is_timeout() || e.is_connect() => {
1484 last_err = format!("Request failed: {}", e);
1485 }
1486 Err(e) => return Err(format!("Request failed: {}", e)),
1487 }
1488 if attempt < 2 {
1489 let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1490 tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1491 }
1492 }
1493 let res = response_opt
1494 .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1495
1496 let body: ChatResponse = res
1497 .json()
1498 .await
1499 .map_err(|e| format!("Response parse error: {}", e))?;
1500
1501 if let Some(usage) = &body.usage {
1502 let mut econ = self.economics.lock().unwrap();
1503 econ.input_tokens += usage.prompt_tokens;
1504 econ.output_tokens += usage.completion_tokens;
1505 }
1506
1507 let choice = body
1508 .choices
1509 .into_iter()
1510 .next()
1511 .ok_or_else(|| "Empty response from model".to_string())?;
1512
1513 let finish_reason = choice.finish_reason;
1514 let mut tool_calls = choice.message.tool_calls;
1515 let mut content = choice.message.content;
1516
1517 if let Some(raw_content) = &content {
1520 let native_calls = extract_native_tool_calls(raw_content);
1521 if !native_calls.is_empty() {
1522 let mut existing = tool_calls.unwrap_or_default();
1523 existing.extend(native_calls);
1524 tool_calls = Some(existing);
1525 let stripped = strip_native_tool_call_text(raw_content);
1526 content = if stripped.trim().is_empty() {
1527 None
1528 } else {
1529 Some(stripped)
1530 };
1531 }
1532 }
1533
1534 if is_gemma4_model_name(&model) {
1535 if let Some(calls) = tool_calls.as_mut() {
1536 for call in calls.iter_mut() {
1537 call.function.arguments = normalize_tool_argument_string(
1538 &call.function.name,
1539 &call.function.arguments,
1540 );
1541 }
1542 }
1543 }
1544
1545 let reasoning_text = choice.message.reasoning_content.unwrap_or_default();
1550 if tool_calls.as_ref().map(|v| v.is_empty()).unwrap_or(true)
1551 && content
1552 .as_ref()
1553 .map(|s| s.trim().is_empty())
1554 .unwrap_or(true)
1555 && !reasoning_text.is_empty()
1556 {
1557 let recovered = extract_native_tool_calls(&reasoning_text);
1558 if !recovered.is_empty() {
1559 tool_calls = Some(recovered);
1560 content = None;
1562 } else if finish_reason.as_deref() == Some("stop") {
1563 content = Some(reasoning_text);
1569 }
1570 }
1571
1572 Ok((content, tool_calls, body.usage, finish_reason))
1573 }
1574
1575 pub async fn stream_messages(
1579 &self,
1580 messages: &[ChatMessage],
1581 tx: mpsc::Sender<InferenceEvent>,
1582 ) -> Result<(), Box<dyn std::error::Error>> {
1583 let current_model = self.current_model();
1584 let request_messages = if should_use_gemma_native_formatting(self, ¤t_model) {
1585 prepare_gemma_native_messages(messages)
1586 } else {
1587 messages
1588 .iter()
1589 .map(|m| {
1590 let mut clone = m.clone();
1591 let current_text = m.content.as_str();
1592 if !current_text.starts_with("<|turn>") {
1593 clone.content = MessageContent::Text(format!(
1594 "<|turn>{}\n{}\n<turn|>",
1595 m.role, current_text
1596 ));
1597 }
1598 clone
1599 })
1600 .collect()
1601 };
1602
1603 let request = ChatRequest {
1604 model: current_model.clone(),
1605 messages: request_messages,
1606 temperature: 0.7,
1607 stream: true,
1608 tools: None,
1609 };
1610
1611 if let Err(e) = preflight_chat_request(
1612 ¤t_model,
1613 &request.messages,
1614 &[],
1615 self.current_context_length(),
1616 ) {
1617 let tag = classify_runtime_failure_tag(&e);
1618 let _ = tx
1619 .send(InferenceEvent::ProviderStatus {
1620 state: provider_state_for_failure_tag(tag),
1621 summary: compact_runtime_failure_summary(tag, &e),
1622 })
1623 .await;
1624 let _ = tx
1625 .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1626 .await;
1627 let _ = tx.send(InferenceEvent::Done).await;
1628 return Ok(());
1629 }
1630
1631 let mut last_err = String::new();
1632 let mut response_opt: Option<reqwest::Response> = None;
1633 for attempt in 0..2u32 {
1634 match self.client.post(&self.api_url).json(&request).send().await {
1635 Ok(res) if res.status().is_success() => {
1636 response_opt = Some(res);
1637 break;
1638 }
1639 Ok(res) if res.status().as_u16() >= 500 => {
1640 last_err = format!("LM Studio error {}", res.status());
1641 }
1642 Ok(res) => {
1643 let status = res.status();
1644 let body = res.text().await.unwrap_or_default();
1645 let preview = &body[..body.len().min(300)];
1646 let detail = format!("LM Studio error {}: {}", status, preview);
1647 let tag = classify_runtime_failure_tag(&detail);
1648 let _ = tx
1649 .send(InferenceEvent::ProviderStatus {
1650 state: provider_state_for_failure_tag(tag),
1651 summary: compact_runtime_failure_summary(tag, &detail),
1652 })
1653 .await;
1654 let _ = tx
1655 .send(InferenceEvent::Error(format_runtime_failure_message(
1656 &detail,
1657 )))
1658 .await;
1659 let _ = tx.send(InferenceEvent::Done).await;
1660 return Ok(());
1661 }
1662 Err(e) if e.is_timeout() || e.is_connect() => {
1663 last_err = format!("Request failed: {}", e);
1664 }
1665 Err(e) => {
1666 let detail = format!("Request failed: {}", e);
1667 let tag = classify_runtime_failure_tag(&detail);
1668 let _ = tx
1669 .send(InferenceEvent::ProviderStatus {
1670 state: provider_state_for_failure_tag(tag),
1671 summary: compact_runtime_failure_summary(tag, &detail),
1672 })
1673 .await;
1674 let _ = tx
1675 .send(InferenceEvent::Error(format_runtime_failure_message(
1676 &detail,
1677 )))
1678 .await;
1679 let _ = tx.send(InferenceEvent::Done).await;
1680 return Ok(());
1681 }
1682 }
1683 if attempt < 1 {
1684 let _ = tx
1685 .send(InferenceEvent::ProviderStatus {
1686 state: ProviderRuntimeState::Recovering,
1687 summary: "LM Studio degraded during stream startup; retrying once.".into(),
1688 })
1689 .await;
1690 tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1691 }
1692 }
1693 let Some(res) = response_opt else {
1694 let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1695 let tag = classify_runtime_failure_tag(&detail);
1696 let _ = tx
1697 .send(InferenceEvent::ProviderStatus {
1698 state: provider_state_for_failure_tag(tag),
1699 summary: compact_runtime_failure_summary(tag, &detail),
1700 })
1701 .await;
1702 let _ = tx
1703 .send(InferenceEvent::Error(format_runtime_failure_message(
1704 &detail,
1705 )))
1706 .await;
1707 let _ = tx.send(InferenceEvent::Done).await;
1708 return Ok(());
1709 };
1710
1711 use futures::StreamExt;
1712 let mut byte_stream = res.bytes_stream();
1713
1714 let mut line_buffer = String::new();
1717 let mut content_buffer = String::new();
1718 let mut past_think = false;
1719 let mut emitted_any_content = false;
1720 let mut emitted_live_status = false;
1721
1722 loop {
1725 let next = tokio::select! {
1726 chunk = byte_stream.next() => chunk,
1728 _ = tokio::time::sleep(std::time::Duration::from_millis(50)) => {
1729 if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1730 break;
1731 }
1732 continue;
1733 }
1734 };
1735
1736 let Some(item) = next else { break };
1737
1738 let chunk = match item {
1739 Ok(chunk) => chunk,
1740 Err(e) => {
1741 let detail = format!("Request failed: {}", e);
1742 let tag = classify_runtime_failure_tag(&detail);
1743 let _ = tx
1744 .send(InferenceEvent::ProviderStatus {
1745 state: provider_state_for_failure_tag(tag),
1746 summary: compact_runtime_failure_summary(tag, &detail),
1747 })
1748 .await;
1749 let _ = tx
1750 .send(InferenceEvent::Error(format_runtime_failure_message(
1751 &detail,
1752 )))
1753 .await;
1754 let _ = tx.send(InferenceEvent::Done).await;
1755 return Ok(());
1756 }
1757 };
1758 line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1759
1760 while let Some(pos) = line_buffer.find("\n\n") {
1761 let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1762 let data_pos = match event_str.find("data: ") {
1763 Some(p) => p,
1764 None => continue,
1765 };
1766
1767 let data = event_str[data_pos + 6..].trim();
1768 if data == "[DONE]" {
1769 break;
1770 }
1771
1772 if let Ok(json) = serde_json::from_str::<Value>(data) {
1773 let delta = &json["choices"][0]["delta"];
1774
1775 if let Some(reasoning) = delta["reasoning_content"]
1777 .as_str()
1778 .or_else(|| delta["thought"].as_str())
1779 {
1780 if !reasoning.is_empty() {
1781 past_think = false; content_buffer.push_str(reasoning);
1783 if content_buffer.len() > 30
1784 && (reasoning.contains('\n') || reasoning.contains('.'))
1785 {
1786 let _ = tx
1787 .send(InferenceEvent::Thought(content_buffer.clone()))
1788 .await;
1789 emitted_any_content = true;
1790 content_buffer.clear();
1791 }
1792 }
1793 }
1794
1795 if let Some(content) = delta["content"].as_str() {
1797 if content.is_empty() {
1798 continue;
1799 }
1800
1801 if !past_think && !content_buffer.is_empty() && !content.trim().is_empty() {
1804 let _ = tx
1807 .send(InferenceEvent::Thought(content_buffer.clone()))
1808 .await;
1809 content_buffer.clear();
1810 past_think = true;
1811 }
1812
1813 if !past_think {
1814 let lc = content.to_lowercase();
1815 let close = lc
1816 .find("<channel|>")
1817 .map(|i| (i, "<channel|>".len()))
1818 .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1819
1820 if let Some((tag_start, tag_len)) = close {
1821 let before = &content[..tag_start];
1823 content_buffer.push_str(before);
1824 if !content_buffer.trim().is_empty() {
1825 let _ = tx
1826 .send(InferenceEvent::Thought(content_buffer.clone()))
1827 .await;
1828 emitted_any_content = true;
1829 }
1830 content_buffer.clear();
1831
1832 past_think = true;
1833 let after = content[tag_start + tag_len..].trim_start_matches('\n');
1834 content_buffer.push_str(after);
1835 } else {
1836 content_buffer.push_str(content);
1838 if content_buffer.len() > 30
1839 && (content.contains('\n') || content.contains('.'))
1840 {
1841 let _ = tx
1842 .send(InferenceEvent::Thought(content_buffer.clone()))
1843 .await;
1844 emitted_any_content = true;
1845 content_buffer.clear();
1846 }
1847 }
1848 } else {
1849 content_buffer.push_str(content);
1851 let is_boundary = content.contains(' ')
1852 || content.contains('.')
1853 || content.contains('!')
1854 || content.contains('?');
1855
1856 if content_buffer.len() > 10 && is_boundary {
1857 if !emitted_live_status {
1858 let _ = tx
1859 .send(InferenceEvent::ProviderStatus {
1860 state: ProviderRuntimeState::Live,
1861 summary: String::new(),
1862 })
1863 .await;
1864 emitted_live_status = true;
1865 }
1866 let _ =
1867 tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1868 emitted_any_content = true;
1869 content_buffer.clear();
1870 }
1871 }
1872 }
1873 }
1874 }
1875 }
1876
1877 if !content_buffer.is_empty() {
1879 if past_think {
1880 if !emitted_live_status {
1881 let _ = tx
1882 .send(InferenceEvent::ProviderStatus {
1883 state: ProviderRuntimeState::Live,
1884 summary: String::new(),
1885 })
1886 .await;
1887 }
1888 let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1889 } else {
1890 let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1891 }
1892 emitted_any_content = true;
1893 }
1894
1895 if !emitted_any_content {
1896 let _ = tx
1897 .send(InferenceEvent::ProviderStatus {
1898 state: ProviderRuntimeState::EmptyResponse,
1899 summary: compact_runtime_failure_summary(
1900 "empty_model_response",
1901 "Empty response from model",
1902 ),
1903 })
1904 .await;
1905 let _ = tx
1906 .send(InferenceEvent::Error(format_runtime_failure_message(
1907 "Empty response from model",
1908 )))
1909 .await;
1910 let _ = tx.send(InferenceEvent::Done).await;
1911 return Ok(());
1912 }
1913
1914 let _ = tx.send(InferenceEvent::Done).await;
1915 Ok(())
1916 }
1917
1918 pub async fn stream_generation(
1920 &self,
1921 prompt: &str,
1922 snark: u8,
1923 chaos: u8,
1924 brief: bool,
1925 professional: bool,
1926 tx: mpsc::Sender<InferenceEvent>,
1927 ) -> Result<(), Box<dyn std::error::Error>> {
1928 let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1929 let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1930 self.stream_messages(&messages, tx).await
1931 }
1932
1933 pub async fn generate_task_worker(
1937 &self,
1938 prompt: &str,
1939 professional: bool,
1940 ) -> Result<String, String> {
1941 let current_model = self.current_model();
1942 let model = self
1943 .worker_model
1944 .as_deref()
1945 .unwrap_or(current_model.as_str());
1946 self.generate_task_with_model(prompt, 0.1, professional, model)
1947 .await
1948 }
1949
1950 pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1951 self.generate_task_with_temp(prompt, 0.1, professional)
1952 .await
1953 }
1954
1955 pub async fn generate_task_with_temp(
1956 &self,
1957 prompt: &str,
1958 temp: f32,
1959 professional: bool,
1960 ) -> Result<String, String> {
1961 let current_model = self.current_model();
1962 self.generate_task_with_model(prompt, temp, professional, ¤t_model)
1963 .await
1964 }
1965
1966 pub async fn generate_task_with_model(
1967 &self,
1968 prompt: &str,
1969 temp: f32,
1970 professional: bool,
1971 model: &str,
1972 ) -> Result<String, String> {
1973 let _permit = self
1974 .kv_semaphore
1975 .acquire()
1976 .await
1977 .map_err(|e| e.to_string())?;
1978
1979 let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1980 let request_messages = if should_use_gemma_native_formatting(self, model) {
1981 prepare_gemma_native_messages(&[
1982 ChatMessage::system(&system),
1983 ChatMessage::user(prompt),
1984 ])
1985 } else {
1986 vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1987 };
1988 let request = ChatRequest {
1989 model: model.to_string(),
1990 messages: request_messages,
1991 temperature: temp,
1992 stream: false,
1993 tools: None,
1994 };
1995
1996 preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1997
1998 let res = self
1999 .client
2000 .post(&self.api_url)
2001 .json(&request)
2002 .send()
2003 .await
2004 .map_err(|e| format!("LM Studio request failed: {}", e))?;
2005
2006 let body: ChatResponse = res
2007 .json()
2008 .await
2009 .map_err(|e| format!("Failed to parse response: {}", e))?;
2010
2011 body.choices
2012 .first()
2013 .and_then(|c| c.message.content.clone())
2014 .ok_or_else(|| "Empty response from model".to_string())
2015 }
2016
2017 #[allow(dead_code)]
2021 pub fn snip_history(
2022 &self,
2023 turns: &[ChatMessage],
2024 max_tokens_estimate: usize,
2025 keep_recent: usize,
2026 ) -> Vec<ChatMessage> {
2027 let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
2028 if total_chars / 4 <= max_tokens_estimate {
2029 return turns.to_vec();
2030 }
2031 let keep = keep_recent.min(turns.len());
2032 let mut snipped = vec![turns[0].clone()];
2033 if turns.len() > keep + 1 {
2034 snipped.push(ChatMessage::system(&format!(
2035 "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
2036 turns.len() - keep - 1
2037 )));
2038 snipped.extend_from_slice(&turns[turns.len() - keep..]);
2039 } else {
2040 snipped = turns.to_vec();
2041 }
2042 snipped
2043 }
2044}
2045
2046fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
2047 serde_json::to_vec(value)
2048 .ok()
2049 .map_or(0, |bytes| bytes.len() / 4 + 1)
2050}
2051
2052const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
2053
2054fn estimate_message_tokens(message: &ChatMessage) -> usize {
2055 let content_tokens = match &message.content {
2056 MessageContent::Text(s) => s.len() / 4 + 1,
2057 MessageContent::Parts(parts) => parts
2058 .iter()
2059 .map(|part| match part {
2060 ContentPart::Text { text } => text.len() / 4 + 1,
2061 ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
2064 })
2065 .sum(),
2066 };
2067 let tool_tokens: usize = message
2068 .tool_calls
2069 .iter()
2070 .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
2071 .sum();
2072 content_tokens + tool_tokens + 6
2073}
2074
2075pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
2076 messages.iter().map(estimate_message_tokens).sum()
2077}
2078
2079fn reserved_output_tokens(context_length: usize) -> usize {
2080 let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2081 proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2082}
2083
2084pub fn estimate_prompt_pressure(
2085 messages: &[ChatMessage],
2086 tools: &[ToolDefinition],
2087 context_length: usize,
2088) -> (usize, usize, usize, u8) {
2089 let estimated_input_tokens =
2090 estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2091 let reserved_output = reserved_output_tokens(context_length);
2092 let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2093 let percent = if context_length == 0 {
2094 0
2095 } else {
2096 ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2097 };
2098 (
2099 estimated_input_tokens,
2100 reserved_output,
2101 estimated_total,
2102 percent,
2103 )
2104}
2105
2106fn preflight_chat_request(
2107 model: &str,
2108 messages: &[ChatMessage],
2109 tools: &[ToolDefinition],
2110 context_length: usize,
2111) -> Result<(), String> {
2112 let (estimated_input_tokens, reserved_output, estimated_total, _) =
2113 estimate_prompt_pressure(messages, tools, context_length);
2114
2115 if estimated_total > context_length {
2116 return Err(format!(
2117 "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2118 model, estimated_input_tokens, reserved_output, estimated_total, context_length
2119 ));
2120 }
2121
2122 Ok(())
2123}
2124
/// Collects project instruction files into one prompt section.
///
/// Starting at the current working directory and walking up through at most
/// three parent directories, each of `CLAUDE.md`, `CLAUDE.local.md` and
/// `.hematite/instructions.md` is read if present. Unreadable, blank, and
/// duplicate (same content hash) files are skipped. A file longer than
/// `MAX_PER_FILE` bytes is cut on a character boundary and marked
/// `...[truncated]`. When adding a file would push the total past
/// `MAX_TOTAL`, the remaining candidates in that directory are skipped.
///
/// Returns an empty string when the working directory is unavailable or
/// nothing was collected; otherwise the text starts with a
/// `# Project Instructions` heading and each file is introduced by its path.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;
    const CANDIDATES: [&str; 3] = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    let Ok(cwd) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;

    let mut dir = cwd;
    for _ in 0..4 {
        for name in &CANDIDATES {
            let path = dir.join(name);
            // A missing file simply fails to read; no separate existence check needed.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back up to a character boundary: slicing at a raw byte
                // offset panics when it lands inside a multi-byte character.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            if total_chars + truncated.len() > MAX_TOTAL {
                break;
            }
            total_chars += truncated.len();
            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
        }
        match dir.parent().map(|p| p.to_path_buf()) {
            Some(parent) => dir = parent,
            None => break,
        }
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2188
/// Extracts the reasoning text between `<|channel>thought` and `<channel|>`.
///
/// Tags are matched ASCII-case-insensitively. When the closing tag is
/// missing, everything after the opening tag is taken. Returns `None` when
/// there is no opening tag or the extracted text is blank; otherwise the
/// trimmed text.
pub fn extract_think_block(text: &str) -> Option<String> {
    const OPEN_TAG: &str = "<|channel>thought";
    const CLOSE_TAG: &str = "<channel|>";

    // ASCII-only lowercasing keeps the byte length unchanged, so offsets
    // found in `lower` are valid for slicing `text`. Full Unicode
    // lowercasing can change the length and shift every later offset.
    let lower = text.to_ascii_lowercase();

    let content_start = lower.find(OPEN_TAG)? + OPEN_TAG.len();
    let close_pos = lower[content_start..]
        .find(CLOSE_TAG)
        .map_or(text.len(), |offset| content_start + offset);

    let content = text[content_start..close_pos].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
2211
/// Removes model reasoning from `text` and returns only the visible answer.
///
/// Steps, in order (all tag matching is ASCII-case-insensitive):
/// 1. A leading `</think>` (after leading whitespace) is dropped.
/// 2. If a `<channel|>` closing tag exists, the answer is everything after
///    its first occurrence, with any remaining channel tags removed.
/// 3. Otherwise, if an opening reasoning tag (`<|channel>thought`, `<think>`,
///    `<thought>`, `<|think|>`) exists, only the text before the first one is
///    kept.
/// 4. Otherwise, if the text contains a known "naked reasoning" phrase, the
///    leading run of blank or reasoning lines is dropped.
/// 5. Otherwise stray XML tool-call / think tags are removed.
///
/// The result is trimmed and has `"\n\n\n"` replaced by `"\n\n"` in a single pass.
pub fn strip_think_blocks(text: &str) -> String {
    const NAKED_REASONING_PHRASES: &[&str] = &[
        "the user asked",
        "the user is asking",
        "the user wants",
        "i will structure",
        "i should provide",
        "i should give",
        "i should avoid",
        "i should note",
        "i should focus",
        "i should keep",
        "i should respond",
        "i should present",
        "i should display",
        "i should show",
        "i need to",
        "i can see from",
        "without being overly",
        "let me ",
        "necessary information in my identity",
        "was computed successfully",
        "computed successfully",
    ];

    let text = {
        let t = text.trim_start();
        // `get` yields None when the text is shorter than the tag or byte 8
        // is not a character boundary, so the slice below cannot panic.
        let has_stray_close = t
            .get(..8)
            .map_or(false, |head| head.eq_ignore_ascii_case("</think>"));
        if has_stray_close {
            &t[8..]
        } else {
            text
        }
    };

    // ASCII-only lowercasing keeps the byte length unchanged, so offsets
    // found in `lower` are valid for slicing `text`.
    let lower = text.to_ascii_lowercase();

    if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
        let answer = text[end..]
            .replace("<|channel>thought", "")
            .replace("<channel|>", "");
        return answer.trim().replace("\n\n\n", "\n\n");
    }

    let first_open = ["<|channel>thought", "<think>", "<thought>", "<|think|>"]
        .iter()
        .filter_map(|tag| lower.find(*tag))
        .min();
    if let Some(start) = first_open {
        // An opening tag at offset 0 leaves nothing before it, giving "".
        return text[..start].trim().replace("\n\n\n", "\n\n");
    }

    if NAKED_REASONING_PHRASES.iter().any(|p| lower.contains(*p)) {
        // Count the leading lines that are blank or contain a reasoning phrase.
        let leading_reasoning = text
            .lines()
            .take_while(|line| {
                let l = line.to_ascii_lowercase();
                l.trim().is_empty() || NAKED_REASONING_PHRASES.iter().any(|p| l.contains(*p))
            })
            .count();
        let remainder: Vec<&str> = text.lines().skip(leading_reasoning).collect();
        return remainder.join("\n").trim().replace("\n\n\n", "\n\n");
    }

    strip_xml_tool_call_artifacts(text)
        .trim()
        .replace("\n\n\n", "\n\n")
}

/// Deletes stray XML-style tool-call and closing think tags from `text`,
/// matching them ASCII-case-insensitively. Only the tags are removed; the
/// text between them is kept.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        "</think>",
        "</thought>",
        "</thinking>",
    ];
    let mut out = text.to_string();
    // Lowercase shadow kept in lock-step with `out`. ASCII lowercasing
    // preserves byte offsets, so a match position applies to both strings.
    let mut shadow = out.to_ascii_lowercase();
    for tag in XML_ARTIFACTS {
        while let Some(pos) = shadow.find(*tag) {
            let end = pos + tag.len();
            out.replace_range(pos..end, "");
            shadow.replace_range(pos..end, "");
        }
    }
    out
}
2346
2347pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2350 use regex::Regex;
2351 let mut results = Vec::new();
2352
2353 let re_call = Regex::new(
2355 r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2356 ).unwrap();
2357 let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2358
2359 for cap in re_call.captures_iter(text) {
2360 let name = cap[1].to_string();
2361 let args_str = &cap[2];
2362 let mut arguments = serde_json::Map::new();
2363
2364 for arg_cap in re_arg.captures_iter(args_str) {
2365 let key = arg_cap[1].to_string();
2366 let val_raw = arg_cap
2367 .get(2)
2368 .map(|m| m.as_str())
2369 .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2370 .unwrap_or("")
2371 .trim();
2372 let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2373
2374 let val = if normalized_raw == "true" {
2375 Value::Bool(true)
2376 } else if normalized_raw == "false" {
2377 Value::Bool(false)
2378 } else if let Ok(n) = normalized_raw.parse::<i64>() {
2379 Value::Number(n.into())
2380 } else if let Ok(n) = normalized_raw.parse::<u64>() {
2381 Value::Number(n.into())
2382 } else if let Ok(n) = normalized_raw.parse::<f64>() {
2383 serde_json::Number::from_f64(n)
2384 .map(Value::Number)
2385 .unwrap_or(Value::String(normalized_raw.clone()))
2386 } else {
2387 Value::String(normalized_raw)
2388 };
2389
2390 arguments.insert(key, val);
2391 }
2392
2393 results.push(ToolCallResponse {
2394 id: format!("call_{}", rand::random::<u32>()),
2395 call_type: "function".to_string(),
2396 function: ToolCallFn {
2397 name,
2398 arguments: Value::Object(arguments).to_string(),
2399 },
2400 });
2401 }
2402
2403 let re_xml_call = Regex::new(
2405 r#"(?s)<tool_call>\s*<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>)?\s*</tool_call>"#
2406 ).unwrap();
2407 let re_xml_param =
2408 Regex::new(r#"(?s)<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)</parameter>"#).unwrap();
2409
2410 for cap in re_xml_call.captures_iter(text) {
2411 let name = cap[1].to_string();
2412 let body = &cap[2];
2413 let mut arguments = serde_json::Map::new();
2414
2415 for p_cap in re_xml_param.captures_iter(body) {
2416 let key = p_cap[1].to_string();
2417 let val_raw = p_cap[2].trim();
2418 let val = if val_raw == "true" {
2419 Value::Bool(true)
2420 } else if val_raw == "false" {
2421 Value::Bool(false)
2422 } else if let Ok(n) = val_raw.parse::<i64>() {
2423 Value::Number(n.into())
2424 } else if let Ok(n) = val_raw.parse::<u64>() {
2425 Value::Number(n.into())
2426 } else {
2427 Value::String(val_raw.to_string())
2428 };
2429 arguments.insert(key, val);
2430 }
2431
2432 if !arguments.is_empty() || name == "clear_context" {
2433 results.push(ToolCallResponse {
2434 id: format!("call_{}", rand::random::<u32>()),
2435 call_type: "function".to_string(),
2436 function: ToolCallFn {
2437 name,
2438 arguments: Value::Object(arguments).to_string(),
2439 },
2440 });
2441 }
2442 }
2443
2444 results
2445}
2446
2447pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2448 let trimmed = raw.trim();
2449 let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2450
2451 let mut value = match serde_json::from_str::<Value>(&candidate) {
2452 Ok(v) => v,
2453 Err(_) => return candidate,
2454 };
2455 normalize_tool_argument_value(tool_name, &mut value);
2456 value.to_string()
2457}
2458
2459fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2460 match value {
2461 Value::String(s) => *s = normalize_string_arg(s),
2462 Value::Array(items) => {
2463 for item in items {
2464 normalize_tool_argument_value(tool_name, item);
2465 }
2466 }
2467 Value::Object(map) => {
2468 for val in map.values_mut() {
2469 normalize_tool_argument_value(tool_name, val);
2470 }
2471 if tool_name == "grep_files" {
2472 if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2473 *pattern = normalize_regex_pattern(pattern);
2474 }
2475 }
2476 for key in ["path", "extension", "query", "command", "reason"] {
2477 if let Some(Value::String(s)) = map.get_mut(key) {
2478 *s = normalize_string_arg(s);
2479 }
2480 }
2481 }
2482 _ => {}
2483 }
2484}
2485
/// Removes exactly one layer of matching quotes (`"`, `'` or a backtick) around `input`.
///
/// Returns `None` when `input` is shorter than two bytes or is not wrapped in a matching
/// pair. Otherwise the inner text is returned with `\"` replaced by `"`, then `\\`
/// replaced by `\`, and finally trimmed.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    if input.len() < 2 {
        return None;
    }

    // At most one quote character can match the first character, so the first hit is
    // the only possible one.
    let inner = ['"', '\'', '`'].iter().find_map(|&quote| {
        input
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote))
    })?;

    let unescaped = inner.replace("\\\"", "\"").replace("\\\\", "\\");
    Some(unescaped.trim().to_string())
}
2499
/// Trims `input` and peels off every layer of matching surrounding quotes.
///
/// A layer is a pair of identical `"`, `'` or backtick characters at both ends of a
/// string that is at least two bytes long. After each layer is removed the remainder is
/// trimmed again, and the process repeats until no layer is left.
fn normalize_string_arg(input: &str) -> String {
    let mut current = input.trim();

    loop {
        let bytes = current.as_bytes();
        // The quote characters are single ASCII bytes, so comparing the first and last
        // byte is equivalent to comparing the first and last character.
        let wrapped = match (bytes.first(), bytes.last()) {
            (Some(&open), Some(&close)) => {
                bytes.len() >= 2 && open == close && matches!(open, b'"' | b'\'' | b'`')
            }
            _ => false,
        };
        if !wrapped {
            break;
        }
        current = current[1..current.len() - 1].trim();
    }

    current.to_string()
}
2517
2518fn normalize_regex_pattern(input: &str) -> String {
2519 let out = normalize_string_arg(input);
2520 if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2521 out[1..out.len() - 1].to_string()
2522 } else {
2523 out
2524 }
2525}
2526
2527fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2528 let mut system_blocks = Vec::new();
2529 let mut prepared = Vec::new();
2530 let mut seeded = false;
2531
2532 for message in messages {
2533 if message.role == "system" {
2534 let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2535 .trim()
2536 .to_string();
2537 if !cleaned.is_empty() {
2538 system_blocks.push(cleaned);
2539 }
2540 continue;
2541 }
2542
2543 let mut clone = message.clone();
2544 clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2545
2546 if !seeded && message.role == "user" {
2547 let mut merged = String::new();
2548 if !system_blocks.is_empty() {
2549 merged.push_str("System instructions for this turn:\n");
2550 merged.push_str(&system_blocks.join("\n\n"));
2551 merged.push_str("\n\n");
2552 }
2553 merged.push_str(clone.content.as_str());
2554 clone.content = MessageContent::Text(merged);
2555 seeded = true;
2556 }
2557
2558 prepared.push(clone);
2559 }
2560
2561 if !seeded && !system_blocks.is_empty() {
2562 prepared.insert(
2563 0,
2564 ChatMessage::user(&format!(
2565 "System instructions for this turn:\n{}",
2566 system_blocks.join("\n\n")
2567 )),
2568 );
2569 }
2570
2571 prepared
2572}
2573
/// Deletes legacy turn markers from `text` and trims the result.
///
/// The opening markers `<|turn>system`, `<|turn>user`, `<|turn>assistant` and
/// `<|turn>tool` are only removed together with the newline that follows them; the
/// closing marker `<turn|>` is removed wherever it occurs. Markers are removed one kind
/// at a time, in the order listed below.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const TURN_MARKERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let mut cleaned = text.to_string();
    for marker in TURN_MARKERS.iter() {
        cleaned = cleaned.replace(*marker, "");
    }
    cleaned.trim().to_string()
}
2583
2584pub fn strip_native_tool_call_text(text: &str) -> String {
2585 use regex::Regex;
2586 let re_call = Regex::new(
2588 r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2589 ).unwrap();
2590 let re_xml = Regex::new(r#"(?s)<tool_call>\s*<function=.*?>.*?</tool_call>"#).unwrap();
2592 let re_response =
2593 Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2594 .unwrap();
2595 let without_calls = re_call.replace_all(text, "");
2596 let without_xml = re_xml.replace_all(without_calls.as_ref(), "");
2597 re_response
2598 .replace_all(without_xml.as_ref(), "")
2599 .trim()
2600 .to_string()
2601}
2602
#[cfg(test)]
mod tests {
    use super::*;

    /// The generated system prompt must mention the running crate version.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let endpoint = "http://localhost:1234/v1".to_string();
        let species = "strategist".to_string();
        let engine = InferenceEngine::new(endpoint, species, 0).expect("engine");

        let prompt = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);

        assert!(prompt.contains(crate::HEMATITE_VERSION));
    }

    /// A native call opened with `<|tool_call>` and closed with `<tool_call|>` is parsed,
    /// its scalar arguments are typed, and the markup is stripped from the text.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        let transcript = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let parsed = extract_native_tool_calls(transcript);
        assert_eq!(parsed.len(), 1);

        let call = &parsed[0];
        assert_eq!(call.function.name, "read_file");

        let arguments: Value = serde_json::from_str(&call.function.arguments).unwrap();
        assert_eq!(arguments["limit"].as_i64(), Some(100));
        assert_eq!(arguments["offset"].as_i64(), Some(100));
        assert_eq!(arguments["path"].as_str(), Some("src/ui/tui.rs"));

        let visible = strip_native_tool_call_text(transcript);
        for marker in ["<|tool_call", "<tool_call|>"] {
            assert!(!visible.contains(marker));
        }
    }

    /// Tool responses that the model invented between its own calls are removed, while
    /// both real calls are still extracted in order.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let transcript = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let parsed = extract_native_tool_calls(transcript);
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].function.name, "list_files");
        assert_eq!(parsed[1].function.name, "read_file");

        let visible = strip_native_tool_call_text(transcript);
        for marker in ["<|tool_call", "<|tool_response", "<tool_response|>"] {
            assert!(!visible.contains(marker));
        }
    }

    /// Qwen-style XML tool calls embedded in free text are parsed, with parameter values
    /// trimmed, and the markup is stripped from the text.
    #[test]
    fn extracts_qwen_xml_tool_calls_from_reasoning() {
        let transcript = r#"Based on the project structure, I need to check the binary.
<tool_call>
<function=shell>
<parameter=command>
ls -la hematite.exe
</parameter>
<parameter=reason>
Check if the binary exists
</parameter>
</function>
</tool_call>"#;

        let parsed = extract_native_tool_calls(transcript);
        assert_eq!(parsed.len(), 1);

        let call = &parsed[0];
        assert_eq!(call.function.name, "shell");

        let arguments: Value = serde_json::from_str(&call.function.arguments).unwrap();
        assert_eq!(arguments["command"].as_str(), Some("ls -la hematite.exe"));
        assert_eq!(
            arguments["reason"].as_str(),
            Some("Check if the binary exists")
        );

        let visible = strip_native_tool_call_text(transcript);
        for marker in ["<tool_call>", "<function=shell>"] {
            assert!(!visible.contains(marker));
        }
    }
}