1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
/// Handle to a chat-completions server plus the runtime state (model id,
/// context length, feature flags) tracked alongside it.
pub struct InferenceEngine {
    /// HTTP client used for every request issued by this engine.
    pub client: reqwest::Client,
    /// Chat-completions endpoint URL; `new` makes sure it ends in `/chat/completions`.
    pub api_url: String,
    /// Scheme and authority derived from the configured URL; model-listing
    /// routes such as `/v1/models` and `/api/v0/models` are appended to it.
    pub base_url: String,
    /// Persona label interpolated into the non-professional system prompt.
    pub species: String,
    /// Snark level supplied at construction.
    pub snark: u8,
    /// Semaphore created with 3 permits in `new`.
    /// NOTE(review): presumably bounds concurrent inference calls -- confirm against callers.
    pub kv_semaphore: Semaphore,
    /// Id of the model currently treated as loaded; empty until a profile is set.
    pub model: std::sync::RwLock<String>,
    /// Context window in tokens; `new` seeds it with 32_768.
    pub context_length: std::sync::atomic::AtomicUsize,
    /// Shared, mutex-guarded session economics record.
    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
    /// Optional secondary model id; `new` leaves it `None`.
    /// NOTE(review): consumers are outside this excerpt.
    pub worker_model: Option<String>,
    /// Toggle that, together with the model name, decides whether native formatting is used.
    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
    /// Shared boolean flag, initially `false`.
    /// NOTE(review): presumably set to request cancellation -- confirm against callers.
    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
}
30
/// Reports whether `model` names a model that gets Hematite's native prompt
/// format, i.e. its id contains `gemma-4` or `gemma4` (ASCII case-insensitive).
pub fn is_hematite_native_model(model: &str) -> bool {
    const NATIVE_MARKERS: [&str; 2] = ["gemma-4", "gemma4"];

    let normalized = model.to_ascii_lowercase();
    NATIVE_MARKERS
        .iter()
        .any(|marker| normalized.contains(*marker))
}
35
36fn should_use_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37 is_hematite_native_model(model) && engine.gemma_native_formatting_enabled()
38}
39
/// A tool declaration that can be serialized into a request, paired with
/// local-only classification metadata.
#[derive(Serialize, Clone, Debug)]
pub struct ToolDefinition {
    /// Serialized under the key `type`.
    #[serde(rename = "type")]
    pub tool_type: String,
    /// Name, description, and parameter schema of the callable.
    pub function: ToolFunction,
    /// Local classification of the tool; never written to serialized output.
    #[serde(skip_serializing, skip_deserializing)]
    pub metadata: ToolMetadata,
}
50
/// The callable part of a tool declaration.
#[derive(Serialize, Clone, Debug)]
pub struct ToolFunction {
    /// Tool name.
    pub name: String,
    /// Human-readable description of the tool.
    pub description: String,
    /// Arbitrary JSON value describing the parameters
    /// (presumably a JSON Schema object -- confirm against tool registration).
    pub parameters: Value,
}
57
/// Coarse functional grouping assigned to every tool.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ToolCategory {
    RepoRead,
    RepoWrite,
    Runtime,
    Architecture,
    Toolchain,
    Verification,
    Git,
    Research,
    Vision,
    Lsp,
    Workflow,
    External,
    Other,
}

/// Policy-relevant traits of a tool, as produced by [`tool_metadata_for_name`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ToolMetadata {
    /// Functional grouping of the tool.
    pub category: ToolCategory,
    /// Whether the tool is classified as changing the workspace.
    pub mutates_workspace: bool,
    /// Whether the tool is classified as an external surface.
    pub external_surface: bool,
    /// Whether the tool is classified as trust-sensitive.
    pub trust_sensitive: bool,
    /// Whether the tool is classified as acceptable in read-only contexts.
    pub read_only_friendly: bool,
    /// Whether the tool is classified as in scope for planning.
    pub plan_scope: bool,
}

/// Classifies a tool purely by its name.
///
/// Names starting with `mcp__` are treated as external and trust-sensitive;
/// they count as workspace-mutating when the lowercased name contains one of a
/// fixed set of verb markers (`__edit`, `__write`, ...). Built-in tool names map
/// to fixed metadata, and any unrecognized name falls back to a read-only entry
/// in [`ToolCategory::Other`].
///
/// NOTE(review): `lsp_rename_symbol` is classified as non-mutating here; confirm
/// that matches how the rename tool actually behaves.
pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
    // Template for tools that leave the workspace untouched.
    const READ_ONLY: ToolMetadata = ToolMetadata {
        category: ToolCategory::Other,
        mutates_workspace: false,
        external_surface: false,
        trust_sensitive: false,
        read_only_friendly: true,
        plan_scope: false,
    };
    // Template for local tools that change the workspace.
    const MUTATING: ToolMetadata = ToolMetadata {
        category: ToolCategory::Other,
        mutates_workspace: true,
        external_surface: false,
        trust_sensitive: true,
        read_only_friendly: false,
        plan_scope: false,
    };
    // Substrings that mark an MCP tool name as workspace-mutating.
    const MCP_MUTATION_MARKERS: [&str; 9] = [
        "__edit",
        "__write",
        "__create",
        "__move",
        "__delete",
        "__remove",
        "__rename",
        "__replace",
        "__patch",
    ];

    if name.starts_with("mcp__") {
        let normalized = name.to_ascii_lowercase();
        let mut mutates_workspace = false;
        for marker in MCP_MUTATION_MARKERS.iter() {
            if normalized.contains(*marker) {
                mutates_workspace = true;
                break;
            }
        }
        return ToolMetadata {
            category: ToolCategory::External,
            mutates_workspace,
            external_surface: true,
            trust_sensitive: true,
            read_only_friendly: !mutates_workspace,
            plan_scope: false,
        };
    }

    let read_only = |category: ToolCategory, plan_scope: bool| ToolMetadata {
        category,
        plan_scope,
        ..READ_ONLY
    };
    let mutating = |category: ToolCategory, plan_scope: bool| ToolMetadata {
        category,
        plan_scope,
        ..MUTATING
    };

    match name {
        "read_file" | "inspect_lines" | "grep_files" | "list_files" => {
            read_only(ToolCategory::RepoRead, true)
        }
        "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => {
            mutating(ToolCategory::RepoWrite, true)
        }
        "trace_runtime_flow" => read_only(ToolCategory::Architecture, false),
        "describe_toolchain" => read_only(ToolCategory::Toolchain, false),
        "shell" => mutating(ToolCategory::Runtime, false),
        "inspect_host" => read_only(ToolCategory::Runtime, false),
        // The only built-in that is both mutating and an external surface.
        "resolve_host_issue" => ToolMetadata {
            external_surface: true,
            ..mutating(ToolCategory::Runtime, false)
        },
        "run_hematite_maintainer_workflow" => mutating(ToolCategory::Workflow, false),
        "run_workspace_workflow" => mutating(ToolCategory::Workflow, false),
        "verify_build" => read_only(ToolCategory::Verification, true),
        "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
            mutating(ToolCategory::Git, false)
        }
        "research_web" | "fetch_docs" => read_only(ToolCategory::Research, false),
        "vision_analyze" => read_only(ToolCategory::Vision, false),
        "lsp_definitions"
        | "lsp_references"
        | "lsp_hover"
        | "lsp_rename_symbol"
        | "lsp_get_diagnostics"
        | "lsp_search_symbol" => read_only(ToolCategory::Lsp, false),
        "auto_pin_context" | "list_pinned" | "clarify" => read_only(ToolCategory::Workflow, true),
        "manage_tasks" => read_only(ToolCategory::Workflow, false),
        _ => read_only(ToolCategory::Other, false),
    }
}
257
/// One message in a chat-completions conversation.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ChatMessage {
    /// Speaker role; the constructors in this file use
    /// "system", "user", "assistant", and "tool".
    pub role: String,
    /// Message payload: plain text or a list of typed parts.
    pub content: MessageContent,
    /// Tool calls attached to the message. Left out of serialized output when
    /// empty and defaulted to empty when missing from input.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tool_calls: Vec<ToolCallResponse>,
    /// Id of the tool call this message answers; omitted from output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
    /// Optional name; tool-result messages store the function name here.
    /// Omitted from output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
}
277
/// Message payload. Untagged for serde, so it serializes either as a bare
/// string or as an array of parts.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(untagged)]
pub enum MessageContent {
    /// Plain text payload.
    Text(String),
    /// Multi-part payload (text and/or image parts).
    Parts(Vec<ContentPart>),
}
284
/// One element of a multi-part message, discriminated by a `type` field.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(tag = "type")]
pub enum ContentPart {
    /// Text part, tagged `"text"`.
    #[serde(rename = "text")]
    Text { text: String },
    /// Image part, tagged `"image_url"`.
    #[serde(rename = "image_url")]
    ImageUrl { image_url: ImageUrlSource },
}
293
/// Wrapper object holding the URL of an image part.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ImageUrlSource {
    /// Image location as passed by the caller
    /// (accepted URL forms are not validated here).
    pub url: String,
}
298
299impl Default for MessageContent {
300 fn default() -> Self {
301 MessageContent::Text(String::new())
302 }
303}
304
305impl MessageContent {
306 pub fn as_str(&self) -> &str {
307 match self {
308 MessageContent::Text(s) => s,
309 MessageContent::Parts(parts) => {
310 for part in parts {
311 if let ContentPart::Text { text } = part {
312 return text;
313 }
314 }
315 ""
316 }
317 }
318 }
319}
320
321impl ChatMessage {
322 pub fn system(content: &str) -> Self {
323 Self {
324 role: "system".into(),
325 content: MessageContent::Text(content.into()),
326 tool_calls: Vec::new(),
327 tool_call_id: None,
328 name: None,
329 }
330 }
331 pub fn user(content: &str) -> Self {
332 Self {
333 role: "user".into(),
334 content: MessageContent::Text(content.into()),
335 tool_calls: Vec::new(),
336 tool_call_id: None,
337 name: None,
338 }
339 }
340 pub fn user_with_image(text: &str, image_url: &str) -> Self {
341 let mut text_parts = text.to_string();
342 if !text_parts.contains("<|image|>") {
343 text_parts.push_str(" <|image|>");
344 }
345 Self {
346 role: "user".into(),
347 content: MessageContent::Parts(vec![
348 ContentPart::Text { text: text_parts },
349 ContentPart::ImageUrl {
350 image_url: ImageUrlSource {
351 url: image_url.into(),
352 },
353 },
354 ]),
355 tool_calls: Vec::new(),
356 tool_call_id: None,
357 name: None,
358 }
359 }
360 pub fn assistant_text(content: &str) -> Self {
361 Self {
362 role: "assistant".into(),
363 content: MessageContent::Text(content.into()),
364 tool_calls: Vec::new(),
365 tool_call_id: None,
366 name: None,
367 }
368 }
369 pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
370 Self {
371 role: "assistant".into(),
372 content: MessageContent::Text(content.into()),
373 tool_calls: calls,
374 tool_call_id: None,
375 name: None,
376 }
377 }
378 pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
379 Self::tool_result_for_model(tool_call_id, fn_name, content, "")
380 }
381
382 pub fn tool_result_for_model(
385 tool_call_id: &str,
386 fn_name: &str,
387 content: &str,
388 model: &str,
389 ) -> Self {
390 let body = if is_hematite_native_model(model) {
391 format!(
392 "<|tool_response>response:{}{}{}<tool_response|>",
393 fn_name, "{", content
394 )
395 } else {
396 content.to_string()
397 };
398 Self {
399 role: "tool".into(),
400 content: MessageContent::Text(body),
401 tool_calls: Vec::new(),
402 tool_call_id: Some(tool_call_id.into()),
403 name: Some(fn_name.into()),
404 }
405 }
406}
407
/// A single tool call as it appears in a chat message.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallResponse {
    /// Identifier of the call; tool-result messages refer back to it.
    pub id: String,
    /// Carried on the wire under the key `type`.
    #[serde(rename = "type")]
    pub call_type: String,
    /// The function being invoked and its arguments.
    pub function: ToolCallFn,
}
417
/// Function name and arguments of a tool call.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallFn {
    /// Name of the function to invoke.
    pub name: String,
    /// Arguments as a string (presumably JSON-encoded -- confirm against the parser).
    pub arguments: String,
}
424
/// Request body serialized for the chat-completions endpoint.
#[derive(Serialize)]
struct ChatRequest {
    /// Model id to run.
    model: String,
    /// Conversation messages, in order.
    messages: Vec<ChatMessage>,
    /// Sampling temperature.
    temperature: f32,
    /// Streaming flag sent with the request.
    stream: bool,
    /// Tool declarations; the key is omitted entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    tools: Option<Vec<ToolDefinition>>,
}
436
/// Response body deserialized from the chat-completions endpoint.
#[derive(Deserialize, Debug)]
struct ChatResponse {
    /// Completion choices returned by the server.
    choices: Vec<ResponseChoice>,
    /// Token accounting, when the server reports it.
    usage: Option<TokenUsage>,
}
442
/// Token accounting reported with a completion.
#[derive(Deserialize, Debug, Clone)]
pub struct TokenUsage {
    /// Tokens consumed by the prompt.
    pub prompt_tokens: usize,
    /// Tokens produced in the completion.
    pub completion_tokens: usize,
    /// Total token count as reported by the server.
    pub total_tokens: usize,
    /// Cache-hit prompt tokens; defaults to 0 when the field is absent.
    #[serde(default)]
    pub prompt_cache_hit_tokens: usize,
    /// Cache-read input tokens; defaults to 0 when the field is absent.
    #[serde(default)]
    pub cache_read_input_tokens: usize,
}
453
/// One completion choice inside a `ChatResponse`.
#[derive(Deserialize, Debug)]
struct ResponseChoice {
    /// The generated message.
    message: ResponseMessage,
    /// Finish reason as reported by the server; `None` when absent.
    #[serde(default)]
    finish_reason: Option<String>,
}
460
/// Assistant message as returned by the server.
#[derive(Deserialize, Debug)]
struct ResponseMessage {
    /// Text content, if any.
    content: Option<String>,
    /// Tool calls requested by the model, if any.
    tool_calls: Option<Vec<ToolCallResponse>>,
    /// Separate reasoning text, if the server provides it; `None` when absent.
    #[serde(default)]
    reasoning_content: Option<String>,
}
471
// Bounds, in tokens, on the output reservation.
// NOTE(review): the code that applies these bounds is outside this excerpt;
// the meaning is inferred from the names -- confirm at the use site.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1024;
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4096;
474
/// A context window of at most 8_192 tokens counts as "tiny".
fn is_tiny_context_window(context_length: usize) -> bool {
    matches!(context_length, 0..=8_192)
}
478
/// A context window above 8_192 and up to 49_152 tokens counts as "compact".
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
482
/// Public entry point that forwards to the private `is_compact_context_window`
/// check, so other modules can ask whether a context length is "compact".
pub fn is_compact_context_window_pub(context_length: usize) -> bool {
    is_compact_context_window(context_length)
}
486
/// Detects provider error text that signals a context-limit failure.
///
/// `lower` is expected to be lowercased already: every comparison is a plain
/// substring match against lowercase phrases.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const LIMIT_PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    // The `n_keep` / `n_ctx` pair only counts when both names are present.
    let mentions_keep_and_ctx = lower.contains("n_keep") && lower.contains("n_ctx");
    mentions_keep_and_ctx || LIMIT_PHRASES.iter().any(|phrase| lower.contains(*phrase))
}
494
495fn classify_runtime_failure_tag(detail: &str) -> &'static str {
496 let lower = detail.to_ascii_lowercase();
497 if lower.contains("context_window_blocked")
498 || lower.contains("context ceiling reached")
499 || lower.contains("exceeds the")
500 || is_provider_context_limit_detail(&lower)
501 {
502 "context_window"
503 } else if lower.contains("empty response from model")
504 || lower.contains("model returned an empty response")
505 {
506 "empty_model_response"
507 } else if lower.contains("action blocked:")
508 || lower.contains("access denied")
509 || lower.contains("declined by user")
510 {
511 "tool_policy_blocked"
512 } else {
513 "provider_degraded"
514 }
515}
516
/// Returns the operator guidance sentence for a failure tag. Unknown tags
/// share the generic "retry, then narrow or restart" advice.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const CONTEXT_WINDOW: &str = "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    const EMPTY_RESPONSE: &str = "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    const POLICY_BLOCKED: &str =
        "Stay inside the allowed workflow or switch modes before retrying.";
    const GENERIC: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";

    if tag == "context_window" {
        CONTEXT_WINDOW
    } else if tag == "empty_model_response" {
        EMPTY_RESPONSE
    } else if tag == "tool_policy_blocked" {
        POLICY_BLOCKED
    } else {
        GENERIC
    }
}
531
532fn format_runtime_failure_message(detail: &str) -> String {
533 let tag = classify_runtime_failure_tag(detail);
534 format!(
535 "[failure:{}] {} Detail: {}",
536 tag,
537 runtime_failure_guidance(tag),
538 detail.trim()
539 )
540}
541
/// State of the inference provider as reported in `ProviderStatus` events.
/// Within this file, failure tags map onto `ContextWindow`, `EmptyResponse`,
/// and `Degraded`; where the other states are raised is not visible here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProviderRuntimeState {
    Booting,
    Live,
    Recovering,
    /// Fallback state for any failure tag without a more specific mapping.
    Degraded,
    /// Selected for the `context_window` failure tag.
    ContextWindow,
    /// Selected for the `empty_model_response` failure tag.
    EmptyResponse,
}
551
/// State of the MCP integration as reported in `McpStatus` events.
/// NOTE(review): the transitions between these states are outside this excerpt.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum McpRuntimeState {
    Unconfigured,
    Healthy,
    Degraded,
    Failed,
}
559
/// Checkpoint states reported to the operator in `OperatorCheckpoint` events.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OperatorCheckpointState {
    Idle,
    RecoveringProvider,
    BudgetReduced,
    HistoryCompacted,
    BlockedContextWindow,
    BlockedPolicy,
    BlockedRecentFileEvidence,
    BlockedExactLineWindow,
    BlockedToolLoop,
    BlockedVerification,
}

impl OperatorCheckpointState {
    /// Stable snake_case identifier for the state.
    pub fn label(self) -> &'static str {
        use OperatorCheckpointState::*;

        match self {
            Idle => "idle",
            RecoveringProvider => "recovering_provider",
            BudgetReduced => "budget_reduced",
            HistoryCompacted => "history_compacted",
            BlockedContextWindow => "blocked_context_window",
            BlockedPolicy => "blocked_policy",
            BlockedRecentFileEvidence => "blocked_recent_file_evidence",
            BlockedExactLineWindow => "blocked_exact_line_window",
            BlockedToolLoop => "blocked_tool_loop",
            BlockedVerification => "blocked_verification",
        }
    }
}
590
591fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
592 match tag {
593 "context_window" => ProviderRuntimeState::ContextWindow,
594 "empty_model_response" => ProviderRuntimeState::EmptyResponse,
595 _ => ProviderRuntimeState::Degraded,
596 }
597}
598
/// Produces a short, single-line summary of a runtime failure.
///
/// Known tags (`context_window`, `empty_model_response`, `tool_policy_blocked`)
/// map to fixed sentences. Any other tag yields `LM Studio degraded: <excerpt>`,
/// where the excerpt is the first 12 whitespace-separated words of `detail`,
/// cut to at most 110 bytes with `...` appended when it was cut. An empty
/// excerpt falls back to a fixed "degraded" sentence.
///
/// The cut always lands on a UTF-8 character boundary, so non-ASCII detail text
/// cannot trigger the panic that `String::truncate` raises for a mid-character
/// length.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    const MAX_EXCERPT_WORDS: usize = 12;
    const MAX_EXCERPT_BYTES: usize = 110;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(MAX_EXCERPT_WORDS)
                .collect::<Vec<_>>()
                .join(" ");
            if excerpt.len() > MAX_EXCERPT_BYTES {
                // Back off to the nearest boundary at or below the limit;
                // index 0 is always a boundary, so this terminates.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }
            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
632
/// Events emitted by the inference layer for its consumer (for example the UI).
/// NOTE(review): per-variant descriptions are derived from variant and field
/// names plus payload types; producers and consumers are outside this excerpt.
#[derive(Debug)]
pub enum InferenceEvent {
    /// A chunk of generated text.
    Token(String),
    /// A chunk of generated text flagged as muted
    /// (presumably not meant for normal display -- confirm).
    MutedToken(String),
    /// A chunk of reasoning text.
    Thought(String),
    /// Status text for the voice subsystem.
    VoiceStatus(String),
    /// A tool call is starting; carries the call id, tool name, and argument string.
    ToolCallStart {
        id: String,
        name: String,
        args: String,
    },
    /// A tool call finished; carries its output and whether it was an error.
    ToolCallResult {
        id: String,
        name: String,
        output: String,
        is_error: bool,
    },
    /// A tool call is waiting on a decision. The boolean answer is delivered
    /// back through `responder`, a one-shot channel sender.
    ApprovalRequired {
        id: String,
        name: String,
        /// Text shown for the request.
        display: String,
        /// Optional diff accompanying the request.
        diff: Option<String>,
        /// Optional label describing the mutation.
        mutation_label: Option<String>,
        /// One-shot sender for the boolean decision
        /// (presumably `true` approves -- confirm against the consumer).
        responder: tokio::sync::oneshot::Sender<bool>,
    },
    /// The turn has completed.
    Done,
    /// Signal related to chaining into plan implementation (semantics not visible here).
    ChainImplementPlan,
    /// An error message.
    Error(String),
    /// Provider state change with a short summary.
    ProviderStatus {
        state: ProviderRuntimeState,
        summary: String,
    },
    /// Operator checkpoint change with a short summary.
    OperatorCheckpoint {
        state: OperatorCheckpointState,
        summary: String,
    },
    /// Summary text of a recovery recipe.
    RecoveryRecipe { summary: String },
    /// MCP state change with a short summary.
    McpStatus {
        state: McpRuntimeState,
        summary: String,
    },
    /// Compaction pressure: estimated tokens against a threshold, plus a percentage.
    CompactionPressure {
        estimated_tokens: usize,
        threshold_tokens: usize,
        percent: u8,
    },
    /// Prompt pressure: estimated input, reserved output, and total tokens
    /// against the context length, plus a percentage.
    PromptPressure {
        estimated_input_tokens: usize,
        reserved_output_tokens: usize,
        estimated_total_tokens: usize,
        context_length: usize,
        percent: u8,
    },
    /// Progress update for a task identified by `id`
    /// (presumably `progress` is a percentage -- confirm).
    TaskProgress {
        id: String,
        label: String,
        progress: u8,
    },
    /// Token usage reported for a completion.
    UsageUpdate(TokenUsage),
    /// The active model id and context length.
    RuntimeProfile {
        model_id: String,
        context_length: usize,
    },
    /// Index status: file count, embedded count, and a docs-only flag.
    VeinStatus {
        file_count: usize,
        embedded_count: usize,
        docs_only: bool,
    },
    /// Paths associated with retrieved context.
    VeinContext { paths: Vec<String> },
    /// A persona re-roll result.
    SoulReroll {
        species: String,
        rarity: String,
        shiny: bool,
        personality: String,
    },
    /// Carries a command string (presumably to be copied for the user -- confirm).
    CopyDiveInCommand(String),
    /// The embedding model id, if one is known.
    EmbedProfile { model_id: Option<String> },
    /// One line of shell output.
    ShellLine(String),
}
747
748impl InferenceEngine {
751 pub fn new(
752 api_url: String,
753 species: String,
754 snark: u8,
755 ) -> Result<Self, Box<dyn std::error::Error>> {
756 let client = reqwest::Client::builder()
757 .timeout(std::time::Duration::from_secs(180))
758 .build()?;
759
760 let base_url = {
762 let trimmed = api_url.trim_end_matches('/');
763 if let Some(scheme_end) = trimmed.find("://") {
764 let after_scheme = &trimmed[scheme_end + 3..];
765 if let Some(path_start) = after_scheme.find('/') {
766 format!(
767 "{}://{}",
768 &trimmed[..scheme_end],
769 &after_scheme[..path_start]
770 )
771 } else {
772 trimmed.to_string()
773 }
774 } else {
775 trimmed.to_string()
776 }
777 };
778
779 let api_url = if api_url.ends_with("/chat/completions") {
780 api_url
781 } else if api_url.ends_with("/") {
782 format!("{}chat/completions", api_url)
783 } else {
784 format!("{}/chat/completions", api_url)
785 };
786
787 Ok(Self {
788 client,
789 api_url,
790 base_url,
791 species,
792 snark,
793 kv_semaphore: Semaphore::new(3),
794 model: std::sync::RwLock::new(String::new()),
795 context_length: std::sync::atomic::AtomicUsize::new(32_768), economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
797 worker_model: None,
798 gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
799 cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
800 })
801 }
802
803 pub fn set_gemma_native_formatting(&self, enabled: bool) {
804 self.gemma_native_formatting
805 .store(enabled, std::sync::atomic::Ordering::SeqCst);
806 }
807
808 pub fn gemma_native_formatting_enabled(&self) -> bool {
809 self.gemma_native_formatting
810 .load(std::sync::atomic::Ordering::SeqCst)
811 }
812
813 pub fn current_model(&self) -> String {
814 self.model.read().map(|g| g.clone()).unwrap_or_default()
815 }
816
817 pub fn current_context_length(&self) -> usize {
818 self.context_length
819 .load(std::sync::atomic::Ordering::SeqCst)
820 }
821
822 pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
823 if let Ok(mut guard) = self.model.write() {
824 *guard = model.to_string();
825 }
826 self.context_length
827 .store(context_length, std::sync::atomic::Ordering::SeqCst);
828 }
829
830 pub async fn health_check(&self) -> bool {
832 let url = format!("{}/v1/models", self.base_url);
833 match self.client.get(&url).send().await {
834 Ok(resp) => resp.status().is_success(),
835 Err(_) => false,
836 }
837 }
838
839 pub async fn get_loaded_model(&self) -> Option<String> {
847 #[derive(Deserialize)]
848 struct ModelList {
849 data: Vec<ModelEntry>,
850 }
851 #[derive(Deserialize)]
852 struct ModelEntry {
853 id: String,
854 #[serde(rename = "type", default)]
855 model_type: String,
856 #[serde(default)]
857 state: String,
858 }
859
860 if let Ok(resp) = self
862 .client
863 .get(format!("{}/api/v0/models", self.base_url))
864 .send()
865 .await
866 {
867 if let Ok(list) = resp.json::<ModelList>().await {
868 let chat_model = list
869 .data
870 .into_iter()
871 .find(|m| m.model_type != "embeddings" && m.state == "loaded")
872 .map(|m| m.id)
873 .unwrap_or_default();
874 return Some(chat_model);
875 }
876 }
877
878 let resp = self
880 .client
881 .get(format!("{}/v1/models", self.base_url))
882 .send()
883 .await
884 .ok()?;
885 let list: ModelList = resp.json().await.ok()?;
886 Some(
887 list.data
888 .into_iter()
889 .find(|m| !m.id.to_lowercase().contains("embed"))
890 .map(|m| m.id)
891 .unwrap_or_default(),
892 )
893 }
894
895 pub async fn get_embedding_model(&self) -> Option<String> {
901 #[derive(Deserialize)]
902 struct ModelList {
903 data: Vec<ModelEntry>,
904 }
905 #[derive(Deserialize)]
906 struct ModelEntry {
907 id: String,
908 #[serde(rename = "type", default)]
909 model_type: String,
910 #[serde(default)]
911 state: String,
912 }
913 let resp = self
914 .client
915 .get(format!("{}/api/v0/models", self.base_url))
916 .send()
917 .await
918 .ok()?;
919 let list: ModelList = resp.json().await.ok()?;
920 list.data
921 .into_iter()
922 .find(|m| m.model_type == "embeddings" && m.state == "loaded")
923 .map(|m| m.id)
924 }
925
926 pub async fn detect_context_length(&self) -> usize {
932 #[derive(Deserialize)]
933 struct LmStudioModel {
934 id: Option<String>,
935 #[serde(rename = "type", default)]
936 model_type: String,
937 state: Option<String>,
938 loaded_context_length: Option<u64>,
939 context_length: Option<u64>,
940 max_context_length: Option<u64>,
941 }
942 #[derive(Deserialize)]
943 struct LmStudioList {
944 data: Vec<LmStudioModel>,
945 }
946
947 if let Ok(resp) = self
949 .client
950 .get(format!("{}/api/v0/models", self.base_url))
951 .send()
952 .await
953 {
954 if let Ok(list) = resp.json::<LmStudioList>().await {
955 let target_model = self.current_model().to_ascii_lowercase();
956 let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
958 let loaded = list
959 .data
960 .iter()
961 .find(|m| {
962 non_embed(m)
963 && m.state.as_deref() == Some("loaded")
964 && m.id
965 .as_deref()
966 .map(|id| id.eq_ignore_ascii_case(&target_model))
967 .unwrap_or(false)
968 })
969 .or_else(|| {
970 list.data
971 .iter()
972 .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
973 })
974 .or_else(|| {
975 list.data.iter().find(|m| {
976 non_embed(m)
977 && m.id
978 .as_deref()
979 .map(|id| id.eq_ignore_ascii_case(&target_model))
980 .unwrap_or(false)
981 })
982 })
983 .or_else(|| list.data.iter().find(|m| non_embed(m)));
984
985 if let Some(model) = loaded {
986 if let Some(ctx) = model.loaded_context_length {
987 if ctx > 0 {
988 return ctx as usize;
989 }
990 }
991 if let Some(ctx) = model.context_length {
992 if ctx > 0 {
993 return ctx as usize;
994 }
995 }
996 if let Some(ctx) = model.max_context_length {
997 if ctx > 0 && ctx <= 32_768 {
998 return ctx as usize;
999 }
1000 }
1001 }
1002 }
1003 }
1004
1005 if self.current_model().to_lowercase().contains("gemma-4") {
1009 return 32_768;
1010 }
1011
1012 32_768
1013 }
1014
1015 pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
1016 let previous_model = self.current_model();
1017 let previous_context = self.current_context_length();
1018
1019 let detected_model = match self.get_loaded_model().await {
1020 Some(m) if !m.is_empty() => m, Some(_) => "no model loaded".to_string(), None => previous_model.clone(), };
1024
1025 if !detected_model.is_empty() && detected_model != previous_model {
1026 if let Ok(mut guard) = self.model.write() {
1027 *guard = detected_model.clone();
1028 }
1029 }
1030
1031 let detected_context = self.detect_context_length().await;
1032 let effective_model = if detected_model.is_empty() {
1033 previous_model.clone()
1034 } else {
1035 detected_model
1036 };
1037
1038 let changed = effective_model != previous_model || detected_context != previous_context;
1039 self.set_runtime_profile(&effective_model, detected_context);
1040
1041 Some((effective_model, detected_context, changed))
1042 }
1043
1044 pub fn build_system_prompt(
1045 &self,
1046 snark: u8,
1047 chaos: u8,
1048 brief: bool,
1049 professional: bool,
1050 tools: &[ToolDefinition],
1051 reasoning_history: Option<&str>,
1052 mcp_tools: &[crate::agent::mcp::McpTool],
1053 ) -> String {
1054 let mut sys = self.build_system_prompt_legacy(
1055 snark,
1056 chaos,
1057 brief,
1058 professional,
1059 tools,
1060 reasoning_history,
1061 );
1062
1063 if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1064 sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1065 sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1066 for tool in mcp_tools {
1067 let description = tool
1068 .description
1069 .as_deref()
1070 .unwrap_or("No description provided.");
1071 sys.push_str(&format!("- {}: {}\n", tool.name, description));
1072 }
1073 }
1074
1075 sys
1076 }
1077
1078 pub fn build_system_prompt_legacy(
1079 &self,
1080 snark: u8,
1081 _chaos: u8,
1082 brief: bool,
1083 professional: bool,
1084 tools: &[ToolDefinition],
1085 reasoning_history: Option<&str>,
1086 ) -> String {
1087 let current_context_length = self.current_context_length();
1088 if is_tiny_context_window(current_context_length) {
1089 return self.build_system_prompt_tiny(brief, professional);
1090 }
1091 if is_compact_context_window(current_context_length) {
1092 return self.build_system_prompt_compact(brief, professional, tools);
1093 }
1094
1095 let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
1097 - You are Hematite, a local coding system working on the user's machine.\n\
1098 - The running Hematite build is ");
1099 sys.push_str(&crate::hematite_version_display());
1100 sys.push_str(".\n\
1101 - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
1102 - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
1103 - For simple questions, answer briefly in plain language.\n\
1104 - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
1105 - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
1106 - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
1107 - Keep internal reasoning inside channel delimiters.\n\
1108 - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
1109 <turn|>\n\n");
1110
1111 if let Some(history) = reasoning_history {
1112 if !history.is_empty() {
1113 sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
1114 sys.push_str(history);
1115 sys.push_str("\n\n");
1116 }
1117 }
1118
1119 if brief {
1121 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
1122 - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
1123 - Depth: Surface-level verification only.\n\n");
1124 } else {
1125 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
1126 - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
1127 - Depth: Full multi-step derivation required.\n\n");
1128 }
1129
1130 let os = std::env::consts::OS;
1132 if professional {
1133 sys.push_str(&format!(
1134 "You are Hematite, a local coding system running on {}. \
1135 The TUI is one interface layer, not your whole identity. \
1136 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
1137 Skip filler and keep the focus on the work.\n",
1138 os
1139 ));
1140 } else {
1141 sys.push_str(&format!(
1142 "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
1143 The terminal UI is only one surface of the system. \
1144 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
1145 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
1146 self.species, snark, os
1147 ));
1148 }
1149
1150 let current_model = self.current_model();
1152 if !current_model.is_empty() {
1153 sys.push_str(&format!(
1154 "Loaded model: {} | Context window: {} tokens. \
1155 Calibrate response length and tool-call depth to fit within this budget.\n\n",
1156 current_model, current_context_length
1157 ));
1158 if is_hematite_native_model(¤t_model) {
1159 sys.push_str(
1160 "Sovereign native note: prefer exact tool JSON with no extra prose when calling tools. \
1161 Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
1162 For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
1163 );
1164 }
1165 } else {
1166 sys.push_str(&format!(
1167 "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
1168 current_context_length
1169 ));
1170 }
1171
1172 let shell_desc = if cfg!(target_os = "windows") {
1174 "[EXTERNAL SHELL]: `powershell` (Windows).\n\
1175 - Use ONLY for builds, tests, or file migrations. \n\
1176 - You MUST use the `powershell` tool directly. \n\
1177 - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
1178 } else {
1179 "[EXTERNAL SHELL]: `bash` (Unix).\n\
1180 - Use ONLY for builds, tests, or file migrations. \n\
1181 - NEVER wrap bash in other shells. \n\n"
1182 };
1183
1184 sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
1185 [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
1186 - These are the ONLY way to explore and modify code. \n\
1187 - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
1188 sys.push_str(shell_desc);
1189
1190 sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
1192 SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
1193
1194 sys.push_str("## TURN ADVISORY\n");
1196 if brief {
1197 sys.push_str("- BRIEF MODE: Respond with ONE concise sentence/block unless more code is required.\n");
1198 }
1199 sys.push_str("- INTERNAL REASONING: Plan your move in the thought channel first.\n");
1200
1201 sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
1203 2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
1204 3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
1205 4. Fix all errors before declaring success.\n\n\
1206 ## PRE-FLIGHT SCOPING PROTOCOL\n\
1207 Before attempting any multi-file task or complex refactor:\n\
1208 1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
1209 2. Use `auto_pin_context` to keep those files in active context.\n\
1210 3. Only then proceed to deeper edits or research.\n\n\
1211 ## REFACTORING PROTOCOL\n\
1212 When modifying existing code or renaming symbols:\n\
1213 1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
1214 2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
1215 3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
1216
1217 sys.push_str(&load_instruction_files());
1219
1220 sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
1222
1223 if !tools.is_empty() {
1225 sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
1226 for tool in tools {
1227 let schema = serde_json::to_string(&tool.function.parameters)
1228 .unwrap_or_else(|_| "{}".to_string());
1229 sys.push_str(&format!(
1230 "<|tool>declaration:{}{}{}<tool|>\n",
1231 tool.function.name, "{", schema
1232 ));
1233 sys.push_str(&format!("// {})\n", tool.function.description));
1234 }
1235 }
1236
1237 sys
1238 }
1239
1240 fn build_system_prompt_compact(
1241 &self,
1242 brief: bool,
1243 professional: bool,
1244 tools: &[ToolDefinition],
1245 ) -> String {
1246 let current_model = self.current_model();
1249 let current_context_length = self.current_context_length();
1250 let os = std::env::consts::OS;
1251
1252 let mut sys = String::from("<|turn>system\n<|think|>\n");
1253 sys.push_str(&format!(
1254 "You are Hematite {}, a local coding harness working on the user's machine.\n",
1255 crate::hematite_version_display()
1256 ));
1257 if professional {
1258 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1259 } else {
1260 sys.push_str(&format!(
1261 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1262 self.species
1263 ));
1264 }
1265 sys.push_str(&format!(
1266 "Model: {} | Context: {} tokens. Keep turns focused.\n",
1267 current_model, current_context_length
1268 ));
1269 if is_hematite_native_model(¤t_model) {
1270 sys.push_str(
1271 "Sovereign native: use exact tool JSON. No extra prose in tool calls. \
1272 Raw regex patterns in grep_files, no slash delimiters.\n",
1273 );
1274 }
1275 if cfg!(target_os = "windows") {
1276 sys.push_str(&format!(
1277 "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1278 os
1279 ));
1280 } else {
1281 sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1282 }
1283 if brief {
1284 sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1285 }
1286
1287 sys.push_str(
1288 "\nCORE RULES:\n\
1289 - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1290 - Verify after edits: run `verify_build` after code changes, before committing.\n\
1291 - One tool at a time. Do not batch unrelated tool calls.\n\
1292 - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1293 - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1294 - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1295 );
1296
1297 if !tools.is_empty() {
1298 sys.push_str("\n# AVAILABLE TOOLS\n");
1299 for tool in tools {
1300 let desc: String = tool.function.description.chars().take(120).collect();
1301 sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1302 }
1303 }
1304
1305 sys.push_str("<turn|>\n");
1306 sys
1307 }
1308
1309 fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1310 let current_model = self.current_model();
1311 let current_context_length = self.current_context_length();
1312 let os = std::env::consts::OS;
1313 let mut sys = format!(
1314 "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1315 crate::hematite_version_display()
1316 );
1317 if professional {
1318 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1319 } else {
1320 sys.push_str(&format!(
1321 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1322 self.species
1323 ));
1324 }
1325 if !current_model.is_empty() {
1326 sys.push_str(&format!(
1327 "Loaded model: {} | Context window: {} tokens.\n",
1328 current_model, current_context_length
1329 ));
1330 } else {
1331 sys.push_str(&format!(
1332 "Context window: {} tokens.\n",
1333 current_context_length
1334 ));
1335 }
1336 sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1337 sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1338 sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1339 if cfg!(target_os = "windows") {
1340 sys.push_str(&format!(
1341 "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1342 os
1343 ));
1344 } else {
1345 sys.push_str(&format!(
1346 "You are running on {}. Use the native Unix shell conventions.\n",
1347 os
1348 ));
1349 }
1350 if brief {
1351 sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1352 }
1353 if is_hematite_native_model(¤t_model) {
1354 sys.push_str(
1355 "Sovereign native note: use exact tool JSON with no extra prose when calling tools.\n",
1356 );
1357 }
1358 sys.push_str("<turn|>\n");
1359 sys
1360 }
1361
1362 pub async fn call_with_tools(
1367 &self,
1368 messages: &[ChatMessage],
1369 tools: &[ToolDefinition],
1370 model_override: Option<&str>,
1372 ) -> Result<
1373 (
1374 Option<String>,
1375 Option<Vec<ToolCallResponse>>,
1376 Option<TokenUsage>,
1377 Option<String>,
1378 ),
1379 String,
1380 > {
1381 let _permit = self
1382 .kv_semaphore
1383 .acquire()
1384 .await
1385 .map_err(|e| e.to_string())?;
1386
1387 let current_model = self.current_model();
1388 let model = model_override.unwrap_or(current_model.as_str()).to_string();
1389 let filtered_tools = if cfg!(target_os = "windows") {
1390 tools
1391 .iter()
1392 .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1393 .cloned()
1394 .collect::<Vec<_>>()
1395 } else {
1396 tools.to_vec()
1397 };
1398
1399 let request_messages = if should_use_native_formatting(self, &model) {
1400 prepare_gemma_native_messages(messages)
1401 } else {
1402 messages.to_vec()
1403 };
1404
1405 const COMPACT_CORE_TOOLS: &[&str] = &[
1410 "read_file",
1411 "inspect_lines",
1412 "edit_file",
1413 "write_file",
1414 "grep_files",
1415 "list_files",
1416 "verify_build",
1417 "shell",
1418 ];
1419 let effective_tools = if is_compact_context_window(self.current_context_length()) {
1420 let core: Vec<_> = filtered_tools
1421 .iter()
1422 .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1423 .cloned()
1424 .collect();
1425 if core.is_empty() {
1426 None
1427 } else {
1428 Some(core)
1429 }
1430 } else if filtered_tools.is_empty() {
1431 None
1432 } else {
1433 Some(filtered_tools)
1434 };
1435
1436 let request = ChatRequest {
1437 model: model.clone(),
1438 messages: request_messages,
1439 temperature: 0.2,
1440 stream: false,
1441 tools: effective_tools,
1442 };
1443
1444 preflight_chat_request(
1446 &model,
1447 &request.messages,
1448 request.tools.as_deref().unwrap_or(&[]),
1449 self.current_context_length(),
1450 )?;
1451
1452 let mut last_err = String::new();
1453 let mut response_opt: Option<reqwest::Response> = None;
1454 for attempt in 0..3u32 {
1455 match self.client.post(&self.api_url).json(&request).send().await {
1456 Ok(res) if res.status().is_success() => {
1457 response_opt = Some(res);
1458 break;
1459 }
1460 Ok(res) if res.status().as_u16() >= 500 => {
1461 last_err = format!("LM Studio error {}", res.status());
1462 }
1463 Ok(res) => {
1464 let status = res.status();
1466 let body = res.text().await.unwrap_or_default();
1467 let preview = &body[..body.len().min(300)];
1468 return Err(format!("LM Studio error {}: {}", status, preview));
1469 }
1470 Err(e) if e.is_timeout() || e.is_connect() => {
1471 last_err = format!("Request failed: {}", e);
1472 }
1473 Err(e) => return Err(format!("Request failed: {}", e)),
1474 }
1475 if attempt < 2 {
1476 let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1477 tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1478 }
1479 }
1480 let res = response_opt
1481 .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1482
1483 let body: ChatResponse = res
1484 .json()
1485 .await
1486 .map_err(|e| format!("Response parse error: {}", e))?;
1487
1488 if let Some(usage) = &body.usage {
1489 let mut econ = self.economics.lock().unwrap();
1490 econ.input_tokens += usage.prompt_tokens;
1491 econ.output_tokens += usage.completion_tokens;
1492 }
1493
1494 let choice = body
1495 .choices
1496 .into_iter()
1497 .next()
1498 .ok_or_else(|| "Empty response from model".to_string())?;
1499
1500 let finish_reason = choice.finish_reason;
1501 let mut tool_calls = choice.message.tool_calls;
1502 let mut content = choice.message.content;
1503
1504 if let Some(raw_content) = &content {
1507 let native_calls = extract_native_tool_calls(raw_content);
1508 if !native_calls.is_empty() {
1509 let mut existing = tool_calls.unwrap_or_default();
1510 existing.extend(native_calls);
1511 tool_calls = Some(existing);
1512 let stripped = strip_native_tool_call_text(raw_content);
1513 content = if stripped.trim().is_empty() {
1514 None
1515 } else {
1516 Some(stripped)
1517 };
1518 }
1519 }
1520
1521 if is_hematite_native_model(&model) {
1522 if let Some(calls) = tool_calls.as_mut() {
1523 for call in calls.iter_mut() {
1524 call.function.arguments = normalize_tool_argument_string(
1525 &call.function.name,
1526 &call.function.arguments,
1527 );
1528 }
1529 }
1530 }
1531
1532 let reasoning_text = choice.message.reasoning_content.unwrap_or_default();
1537 if tool_calls.as_ref().map(|v| v.is_empty()).unwrap_or(true)
1538 && content
1539 .as_ref()
1540 .map(|s| s.trim().is_empty())
1541 .unwrap_or(true)
1542 && !reasoning_text.is_empty()
1543 {
1544 let recovered = extract_native_tool_calls(&reasoning_text);
1545 if !recovered.is_empty() {
1546 tool_calls = Some(recovered);
1547 content = None;
1549 } else if finish_reason.as_deref() == Some("stop") {
1550 content = Some(reasoning_text);
1556 }
1557 }
1558
1559 Ok((content, tool_calls, body.usage, finish_reason))
1560 }
1561
1562 pub async fn stream_messages(
1566 &self,
1567 messages: &[ChatMessage],
1568 tx: mpsc::Sender<InferenceEvent>,
1569 ) -> Result<(), Box<dyn std::error::Error>> {
1570 let current_model = self.current_model();
1571 let request_messages = if should_use_native_formatting(self, ¤t_model) {
1572 prepare_gemma_native_messages(messages)
1573 } else {
1574 messages
1575 .iter()
1576 .map(|m| {
1577 let mut clone = m.clone();
1578 let current_text = m.content.as_str();
1579 if !current_text.starts_with("<|turn>") {
1580 clone.content = MessageContent::Text(format!(
1581 "<|turn>{}\n{}\n<turn|>",
1582 m.role, current_text
1583 ));
1584 }
1585 clone
1586 })
1587 .collect()
1588 };
1589
1590 let request = ChatRequest {
1591 model: current_model.clone(),
1592 messages: request_messages,
1593 temperature: 0.7,
1594 stream: true,
1595 tools: None,
1596 };
1597
1598 if let Err(e) = preflight_chat_request(
1599 ¤t_model,
1600 &request.messages,
1601 &[],
1602 self.current_context_length(),
1603 ) {
1604 let tag = classify_runtime_failure_tag(&e);
1605 let _ = tx
1606 .send(InferenceEvent::ProviderStatus {
1607 state: provider_state_for_failure_tag(tag),
1608 summary: compact_runtime_failure_summary(tag, &e),
1609 })
1610 .await;
1611 let _ = tx
1612 .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1613 .await;
1614 let _ = tx.send(InferenceEvent::Done).await;
1615 return Ok(());
1616 }
1617
1618 let mut last_err = String::new();
1619 let mut response_opt: Option<reqwest::Response> = None;
1620 for attempt in 0..2u32 {
1621 match self.client.post(&self.api_url).json(&request).send().await {
1622 Ok(res) if res.status().is_success() => {
1623 response_opt = Some(res);
1624 break;
1625 }
1626 Ok(res) if res.status().as_u16() >= 500 => {
1627 last_err = format!("LM Studio error {}", res.status());
1628 }
1629 Ok(res) => {
1630 let status = res.status();
1631 let body = res.text().await.unwrap_or_default();
1632 let preview = &body[..body.len().min(300)];
1633 let detail = format!("LM Studio error {}: {}", status, preview);
1634 let tag = classify_runtime_failure_tag(&detail);
1635 let _ = tx
1636 .send(InferenceEvent::ProviderStatus {
1637 state: provider_state_for_failure_tag(tag),
1638 summary: compact_runtime_failure_summary(tag, &detail),
1639 })
1640 .await;
1641 let _ = tx
1642 .send(InferenceEvent::Error(format_runtime_failure_message(
1643 &detail,
1644 )))
1645 .await;
1646 let _ = tx.send(InferenceEvent::Done).await;
1647 return Ok(());
1648 }
1649 Err(e) if e.is_timeout() || e.is_connect() => {
1650 last_err = format!("Request failed: {}", e);
1651 }
1652 Err(e) => {
1653 let detail = format!("Request failed: {}", e);
1654 let tag = classify_runtime_failure_tag(&detail);
1655 let _ = tx
1656 .send(InferenceEvent::ProviderStatus {
1657 state: provider_state_for_failure_tag(tag),
1658 summary: compact_runtime_failure_summary(tag, &detail),
1659 })
1660 .await;
1661 let _ = tx
1662 .send(InferenceEvent::Error(format_runtime_failure_message(
1663 &detail,
1664 )))
1665 .await;
1666 let _ = tx.send(InferenceEvent::Done).await;
1667 return Ok(());
1668 }
1669 }
1670 if attempt < 1 {
1671 let _ = tx
1672 .send(InferenceEvent::ProviderStatus {
1673 state: ProviderRuntimeState::Recovering,
1674 summary: "LM Studio degraded during stream startup; retrying once.".into(),
1675 })
1676 .await;
1677 tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1678 }
1679 }
1680 let Some(res) = response_opt else {
1681 let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1682 let tag = classify_runtime_failure_tag(&detail);
1683 let _ = tx
1684 .send(InferenceEvent::ProviderStatus {
1685 state: provider_state_for_failure_tag(tag),
1686 summary: compact_runtime_failure_summary(tag, &detail),
1687 })
1688 .await;
1689 let _ = tx
1690 .send(InferenceEvent::Error(format_runtime_failure_message(
1691 &detail,
1692 )))
1693 .await;
1694 let _ = tx.send(InferenceEvent::Done).await;
1695 return Ok(());
1696 };
1697
1698 use futures::StreamExt;
1699 let mut byte_stream = res.bytes_stream();
1700
1701 let mut line_buffer = String::new();
1704 let mut content_buffer = String::new();
1705 let mut past_think = false;
1706 let mut emitted_any_content = false;
1707 let mut emitted_live_status = false;
1708
1709 loop {
1712 let next = tokio::select! {
1713 chunk = byte_stream.next() => chunk,
1715 _ = tokio::time::sleep(std::time::Duration::from_millis(50)) => {
1716 if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1717 break;
1718 }
1719 continue;
1720 }
1721 };
1722
1723 let Some(item) = next else { break };
1724
1725 let chunk = match item {
1726 Ok(chunk) => chunk,
1727 Err(e) => {
1728 let detail = format!("Request failed: {}", e);
1729 let tag = classify_runtime_failure_tag(&detail);
1730 let _ = tx
1731 .send(InferenceEvent::ProviderStatus {
1732 state: provider_state_for_failure_tag(tag),
1733 summary: compact_runtime_failure_summary(tag, &detail),
1734 })
1735 .await;
1736 let _ = tx
1737 .send(InferenceEvent::Error(format_runtime_failure_message(
1738 &detail,
1739 )))
1740 .await;
1741 let _ = tx.send(InferenceEvent::Done).await;
1742 return Ok(());
1743 }
1744 };
1745 line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1746
1747 while let Some(pos) = line_buffer.find("\n\n") {
1748 let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1749 let data_pos = match event_str.find("data: ") {
1750 Some(p) => p,
1751 None => continue,
1752 };
1753
1754 let data = event_str[data_pos + 6..].trim();
1755 if data == "[DONE]" {
1756 break;
1757 }
1758
1759 if let Ok(json) = serde_json::from_str::<Value>(data) {
1760 let delta = &json["choices"][0]["delta"];
1761
1762 if let Some(reasoning) = delta["reasoning_content"]
1764 .as_str()
1765 .or_else(|| delta["thought"].as_str())
1766 {
1767 if !reasoning.is_empty() {
1768 past_think = false; content_buffer.push_str(reasoning);
1770 if content_buffer.len() > 30
1771 && (reasoning.contains('\n') || reasoning.contains('.'))
1772 {
1773 let _ = tx
1774 .send(InferenceEvent::Thought(content_buffer.clone()))
1775 .await;
1776 emitted_any_content = true;
1777 content_buffer.clear();
1778 }
1779 }
1780 }
1781
1782 if let Some(content) = delta["content"].as_str() {
1784 if content.is_empty() {
1785 continue;
1786 }
1787
1788 if !past_think && !content_buffer.is_empty() && !content.trim().is_empty() {
1791 let _ = tx
1794 .send(InferenceEvent::Thought(content_buffer.clone()))
1795 .await;
1796 content_buffer.clear();
1797 past_think = true;
1798 }
1799
1800 if !past_think {
1801 let lc = content.to_lowercase();
1802 let close = lc
1803 .find("<channel|>")
1804 .map(|i| (i, "<channel|>".len()))
1805 .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1806
1807 if let Some((tag_start, tag_len)) = close {
1808 let before = &content[..tag_start];
1810 content_buffer.push_str(before);
1811 if !content_buffer.trim().is_empty() {
1812 let _ = tx
1813 .send(InferenceEvent::Thought(content_buffer.clone()))
1814 .await;
1815 emitted_any_content = true;
1816 }
1817 content_buffer.clear();
1818
1819 past_think = true;
1820 let after = content[tag_start + tag_len..].trim_start_matches('\n');
1821 content_buffer.push_str(after);
1822 } else {
1823 content_buffer.push_str(content);
1825 if content_buffer.len() > 30
1826 && (content.contains('\n') || content.contains('.'))
1827 {
1828 let _ = tx
1829 .send(InferenceEvent::Thought(content_buffer.clone()))
1830 .await;
1831 emitted_any_content = true;
1832 content_buffer.clear();
1833 }
1834 }
1835 } else {
1836 content_buffer.push_str(content);
1838 let is_boundary = content.contains(' ')
1839 || content.contains('.')
1840 || content.contains('!')
1841 || content.contains('?');
1842
1843 if content_buffer.len() > 10 && is_boundary {
1844 if !emitted_live_status {
1845 let _ = tx
1846 .send(InferenceEvent::ProviderStatus {
1847 state: ProviderRuntimeState::Live,
1848 summary: String::new(),
1849 })
1850 .await;
1851 emitted_live_status = true;
1852 }
1853 let _ =
1854 tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1855 emitted_any_content = true;
1856 content_buffer.clear();
1857 }
1858 }
1859 }
1860 }
1861 }
1862 }
1863
1864 if !content_buffer.is_empty() {
1866 if past_think {
1867 if !emitted_live_status {
1868 let _ = tx
1869 .send(InferenceEvent::ProviderStatus {
1870 state: ProviderRuntimeState::Live,
1871 summary: String::new(),
1872 })
1873 .await;
1874 }
1875 let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1876 } else {
1877 let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1878 }
1879 emitted_any_content = true;
1880 }
1881
1882 if !emitted_any_content {
1883 let _ = tx
1884 .send(InferenceEvent::ProviderStatus {
1885 state: ProviderRuntimeState::EmptyResponse,
1886 summary: compact_runtime_failure_summary(
1887 "empty_model_response",
1888 "Empty response from model",
1889 ),
1890 })
1891 .await;
1892 let _ = tx
1893 .send(InferenceEvent::Error(format_runtime_failure_message(
1894 "Empty response from model",
1895 )))
1896 .await;
1897 let _ = tx.send(InferenceEvent::Done).await;
1898 return Ok(());
1899 }
1900
1901 let _ = tx.send(InferenceEvent::Done).await;
1902 Ok(())
1903 }
1904
1905 pub async fn stream_generation(
1907 &self,
1908 prompt: &str,
1909 snark: u8,
1910 chaos: u8,
1911 brief: bool,
1912 professional: bool,
1913 tx: mpsc::Sender<InferenceEvent>,
1914 ) -> Result<(), Box<dyn std::error::Error>> {
1915 let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1916 let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1917 self.stream_messages(&messages, tx).await
1918 }
1919
1920 pub async fn generate_task_worker(
1924 &self,
1925 prompt: &str,
1926 professional: bool,
1927 ) -> Result<String, String> {
1928 let current_model = self.current_model();
1929 let model = self
1930 .worker_model
1931 .as_deref()
1932 .unwrap_or(current_model.as_str());
1933 self.generate_task_with_model(prompt, 0.1, professional, model)
1934 .await
1935 }
1936
1937 pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1938 self.generate_task_with_temp(prompt, 0.1, professional)
1939 .await
1940 }
1941
1942 pub async fn generate_task_with_temp(
1943 &self,
1944 prompt: &str,
1945 temp: f32,
1946 professional: bool,
1947 ) -> Result<String, String> {
1948 let current_model = self.current_model();
1949 self.generate_task_with_model(prompt, temp, professional, ¤t_model)
1950 .await
1951 }
1952
1953 pub async fn generate_task_with_model(
1954 &self,
1955 prompt: &str,
1956 temp: f32,
1957 professional: bool,
1958 model: &str,
1959 ) -> Result<String, String> {
1960 let _permit = self
1961 .kv_semaphore
1962 .acquire()
1963 .await
1964 .map_err(|e| e.to_string())?;
1965
1966 let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1967 let request_messages = if should_use_native_formatting(self, model) {
1968 prepare_gemma_native_messages(&[
1969 ChatMessage::system(&system),
1970 ChatMessage::user(prompt),
1971 ])
1972 } else {
1973 vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1974 };
1975 let request = ChatRequest {
1976 model: model.to_string(),
1977 messages: request_messages,
1978 temperature: temp,
1979 stream: false,
1980 tools: None,
1981 };
1982
1983 preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1984
1985 let res = self
1986 .client
1987 .post(&self.api_url)
1988 .json(&request)
1989 .send()
1990 .await
1991 .map_err(|e| format!("LM Studio request failed: {}", e))?;
1992
1993 let body: ChatResponse = res
1994 .json()
1995 .await
1996 .map_err(|e| format!("Failed to parse response: {}", e))?;
1997
1998 body.choices
1999 .first()
2000 .and_then(|c| c.message.content.clone())
2001 .ok_or_else(|| "Empty response from model".to_string())
2002 }
2003
2004 #[allow(dead_code)]
2008 pub fn snip_history(
2009 &self,
2010 turns: &[ChatMessage],
2011 max_tokens_estimate: usize,
2012 keep_recent: usize,
2013 ) -> Vec<ChatMessage> {
2014 let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
2015 if total_chars / 4 <= max_tokens_estimate {
2016 return turns.to_vec();
2017 }
2018 let keep = keep_recent.min(turns.len());
2019 let mut snipped = vec![turns[0].clone()];
2020 if turns.len() > keep + 1 {
2021 snipped.push(ChatMessage::system(&format!(
2022 "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
2023 turns.len() - keep - 1
2024 )));
2025 snipped.extend_from_slice(&turns[turns.len() - keep..]);
2026 } else {
2027 snipped = turns.to_vec();
2028 }
2029 snipped
2030 }
2031}
2032
2033fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
2034 serde_json::to_vec(value)
2035 .ok()
2036 .map_or(0, |bytes| bytes.len() / 4 + 1)
2037}
2038
/// Flat token count charged for each image part when estimating the size of
/// a message (see `estimate_message_tokens`).
const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
2040
2041fn estimate_message_tokens(message: &ChatMessage) -> usize {
2042 let content_tokens = match &message.content {
2043 MessageContent::Text(s) => s.len() / 4 + 1,
2044 MessageContent::Parts(parts) => parts
2045 .iter()
2046 .map(|part| match part {
2047 ContentPart::Text { text } => text.len() / 4 + 1,
2048 ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
2051 })
2052 .sum(),
2053 };
2054 let tool_tokens: usize = message
2055 .tool_calls
2056 .iter()
2057 .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
2058 .sum();
2059 content_tokens + tool_tokens + 6
2060}
2061
2062pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
2063 messages.iter().map(estimate_message_tokens).sum()
2064}
2065
2066fn reserved_output_tokens(context_length: usize) -> usize {
2067 let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2068 proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2069}
2070
2071pub fn estimate_prompt_pressure(
2072 messages: &[ChatMessage],
2073 tools: &[ToolDefinition],
2074 context_length: usize,
2075) -> (usize, usize, usize, u8) {
2076 let estimated_input_tokens =
2077 estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2078 let reserved_output = reserved_output_tokens(context_length);
2079 let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2080 let percent = if context_length == 0 {
2081 0
2082 } else {
2083 ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2084 };
2085 (
2086 estimated_input_tokens,
2087 reserved_output,
2088 estimated_total,
2089 percent,
2090 )
2091}
2092
2093fn preflight_chat_request(
2094 model: &str,
2095 messages: &[ChatMessage],
2096 tools: &[ToolDefinition],
2097 context_length: usize,
2098) -> Result<(), String> {
2099 let (estimated_input_tokens, reserved_output, estimated_total, _) =
2100 estimate_prompt_pressure(messages, tools, context_length);
2101
2102 if estimated_total > context_length {
2103 return Err(format!(
2104 "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2105 model, estimated_input_tokens, reserved_output, estimated_total, context_length
2106 ));
2107 }
2108
2109 Ok(())
2110}
2111
/// Collects project instruction files into one prompt section.
///
/// Starting at the current working directory and walking up through at most
/// four directory levels, reads `CLAUDE.md`, `CLAUDE.local.md` and
/// `.hematite/instructions.md` where present. Blank files and files whose
/// content was already seen are skipped. Each file is cut to about
/// `MAX_PER_FILE` bytes, and a file is only added while the running total
/// stays within `MAX_TOTAL` bytes.
///
/// Returns an empty string when the working directory cannot be determined
/// or nothing was collected; otherwise the files under a
/// `# Project Instructions` heading.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;
    const CANDIDATES: [&str; 3] = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    let Ok(mut dir) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    // Content hashes of files already included, to drop exact duplicates.
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;

    for _ in 0..4 {
        for name in &CANDIDATES {
            let path = dir.join(name);
            // A missing or unreadable file simply fails to read; no separate
            // existence check is needed.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back up to the nearest character boundary: slicing in the
                // middle of a multi-byte character would panic.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            if total_chars + truncated.len() > MAX_TOTAL {
                // Skips the remaining candidates in this directory only;
                // parent directories are still visited.
                break;
            }
            total_chars += truncated.len();
            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
        }
        // Move to the parent directory; stop at the filesystem root.
        if !dir.pop() {
            break;
        }
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2175
/// Extracts the reasoning text enclosed by `<|channel>thought` and
/// `<channel|>`.
///
/// Tags are matched ASCII-case-insensitively. When the closing tag is
/// missing, everything after the opening tag is taken.
///
/// Returns `None` when there is no opening tag or the enclosed text is blank;
/// otherwise the trimmed text.
pub fn extract_think_block(text: &str) -> Option<String> {
    const OPEN_TAG: &str = "<|channel>thought";
    const CLOSE_TAG: &str = "<channel|>";

    // ASCII lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid slice positions in `text`. Full Unicode lowercasing
    // can change lengths and would make those offsets wrong.
    let lower = text.to_ascii_lowercase();

    let content_start = lower.find(OPEN_TAG)? + OPEN_TAG.len();
    let close_pos = lower[content_start..]
        .find(CLOSE_TAG)
        .map_or(text.len(), |p| content_start + p);

    let content = text[content_start..close_pos].trim();
    (!content.is_empty()).then(|| content.to_string())
}
2198
/// Removes model reasoning from `text` and returns the user-facing answer.
///
/// Steps, in order:
/// 1. A stray leading `</think>` (after leading whitespace) is dropped.
/// 2. If a `<channel|>` closer exists, only the text after its first
///    occurrence is kept, with leftover channel tags removed.
/// 3. Otherwise, if an opening reasoning tag exists, only the text before the
///    earliest one is kept.
/// 4. Otherwise, if the text contains a known "naked reasoning" phrase, the
///    leading run of reasoning/blank lines is dropped.
/// 5. Otherwise, leftover XML tool-call tags are removed.
///
/// Tags are matched ASCII-case-insensitively. The result is trimmed and each
/// triple newline is replaced by a double newline.
pub fn strip_think_blocks(text: &str) -> String {
    const STRAY_CLOSE: &str = "</think>";

    let text = {
        let t = text.trim_start();
        // `get` returns `None` if the text is shorter than the tag or the
        // range would split a character, so this cannot panic.
        let has_stray_close = t
            .get(..STRAY_CLOSE.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(STRAY_CLOSE));
        if has_stray_close {
            &t[STRAY_CLOSE.len()..]
        } else {
            text
        }
    };

    // ASCII lowercasing preserves byte offsets, so positions found in `lower`
    // can be used to slice `text`. Unicode lowercasing can change byte
    // lengths and would make those positions wrong.
    let lower = text.to_ascii_lowercase();

    if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
        let answer = text[end..]
            .replace("<|channel>thought", "")
            .replace("<channel|>", "");
        return answer.trim().replace("\n\n\n", "\n\n");
    }

    let first_open = ["<|channel>thought", "<think>", "<thought>", "<|think|>"]
        .iter()
        .filter_map(|tag| lower.find(*tag))
        .min();

    if let Some(start) = first_open {
        if start > 0 {
            return text[..start].trim().replace("\n\n\n", "\n\n");
        }
        return String::new();
    }

    let naked_reasoning_phrases: &[&str] = &[
        "the user asked",
        "the user is asking",
        "the user wants",
        "i will structure",
        "i should provide",
        "i should give",
        "i should avoid",
        "i should note",
        "i should focus",
        "i should keep",
        "i should respond",
        "i should present",
        "i should display",
        "i should show",
        "i need to",
        "i can see from",
        "without being overly",
        "let me ",
        "necessary information in my identity",
        "was computed successfully",
        "computed successfully",
    ];
    let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(*p));
    if is_naked_reasoning {
        let lines: Vec<&str> = text.lines().collect();
        if !lines.is_empty() {
            // Count the leading lines that are blank or contain a reasoning
            // phrase; the answer starts at the first line that is neither.
            let start_idx = lines
                .iter()
                .take_while(|line| {
                    let l = line.to_ascii_lowercase();
                    l.trim().is_empty() || naked_reasoning_phrases.iter().any(|p| l.contains(*p))
                })
                .count();
            if start_idx < lines.len() {
                return lines[start_idx..]
                    .join("\n")
                    .trim()
                    .replace("\n\n\n", "\n\n");
            }
            return String::new();
        }
    }

    let cleaned = strip_xml_tool_call_artifacts(text);
    cleaned.trim().replace("\n\n\n", "\n\n")
}

/// Removes leftover XML-style tool-call and reasoning tags from `text`,
/// matching ASCII-case-insensitively, and returns the remaining text.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        "</think>",
        "</thought>",
        "</thinking>",
    ];
    let mut out = text.to_string();
    // Lowercased shadow copy, kept byte-for-byte aligned with `out` so a
    // position found in one is valid in the other. It is updated in place
    // instead of re-lowercasing the whole string after every removal.
    let mut lowered = out.to_ascii_lowercase();
    for tag in XML_ARTIFACTS {
        while let Some(pos) = lowered.find(tag) {
            let range = pos..pos + tag.len();
            out.drain(range.clone());
            lowered.drain(range);
        }
    }
    out
}
2333
2334pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2337 use regex::Regex;
2338 let mut results = Vec::new();
2339
2340 let re_call = Regex::new(
2342 r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2343 ).unwrap();
2344 let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2345
2346 for cap in re_call.captures_iter(text) {
2347 let name = cap[1].to_string();
2348 let args_str = &cap[2];
2349 let mut arguments = serde_json::Map::new();
2350
2351 for arg_cap in re_arg.captures_iter(args_str) {
2352 let key = arg_cap[1].to_string();
2353 let val_raw = arg_cap
2354 .get(2)
2355 .map(|m| m.as_str())
2356 .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2357 .unwrap_or("")
2358 .trim();
2359 let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2360
2361 let val = if normalized_raw == "true" {
2362 Value::Bool(true)
2363 } else if normalized_raw == "false" {
2364 Value::Bool(false)
2365 } else if let Ok(n) = normalized_raw.parse::<i64>() {
2366 Value::Number(n.into())
2367 } else if let Ok(n) = normalized_raw.parse::<u64>() {
2368 Value::Number(n.into())
2369 } else if let Ok(n) = normalized_raw.parse::<f64>() {
2370 serde_json::Number::from_f64(n)
2371 .map(Value::Number)
2372 .unwrap_or(Value::String(normalized_raw.clone()))
2373 } else {
2374 Value::String(normalized_raw)
2375 };
2376
2377 arguments.insert(key, val);
2378 }
2379
2380 results.push(ToolCallResponse {
2381 id: format!("call_{}", rand::random::<u32>()),
2382 call_type: "function".to_string(),
2383 function: ToolCallFn {
2384 name,
2385 arguments: Value::Object(arguments).to_string(),
2386 },
2387 });
2388 }
2389
2390 let re_xml_call = Regex::new(
2392 r#"(?s)<tool_call>\s*<function=([A-Za-z_][A-Za-z0-9_]*)>(.*?)(?:</function>)?\s*</tool_call>"#
2393 ).unwrap();
2394 let re_xml_param =
2395 Regex::new(r#"(?s)<parameter=([A-Za-z_][A-Za-z0-9_]*)>(.*?)</parameter>"#).unwrap();
2396
2397 for cap in re_xml_call.captures_iter(text) {
2398 let name = cap[1].to_string();
2399 let body = &cap[2];
2400 let mut arguments = serde_json::Map::new();
2401
2402 for p_cap in re_xml_param.captures_iter(body) {
2403 let key = p_cap[1].to_string();
2404 let val_raw = p_cap[2].trim();
2405 let val = if val_raw == "true" {
2406 Value::Bool(true)
2407 } else if val_raw == "false" {
2408 Value::Bool(false)
2409 } else if let Ok(n) = val_raw.parse::<i64>() {
2410 Value::Number(n.into())
2411 } else if let Ok(n) = val_raw.parse::<u64>() {
2412 Value::Number(n.into())
2413 } else {
2414 Value::String(val_raw.to_string())
2415 };
2416 arguments.insert(key, val);
2417 }
2418
2419 results.push(ToolCallResponse {
2420 id: format!("call_{}", rand::random::<u32>()),
2421 call_type: "function".to_string(),
2422 function: ToolCallFn {
2423 name,
2424 arguments: Value::Object(arguments).to_string(),
2425 },
2426 });
2427 }
2428
2429 results
2430}
2431
2432pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2433 let trimmed = raw.trim();
2434 let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2435
2436 let mut value = match serde_json::from_str::<Value>(&candidate) {
2437 Ok(v) => v,
2438 Err(_) => return candidate,
2439 };
2440 normalize_tool_argument_value(tool_name, &mut value);
2441 value.to_string()
2442}
2443
2444fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2445 match value {
2446 Value::String(s) => *s = normalize_string_arg(s),
2447 Value::Array(items) => {
2448 for item in items {
2449 normalize_tool_argument_value(tool_name, item);
2450 }
2451 }
2452 Value::Object(map) => {
2453 for val in map.values_mut() {
2454 normalize_tool_argument_value(tool_name, val);
2455 }
2456 if tool_name == "grep_files" {
2457 if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2458 *pattern = normalize_regex_pattern(pattern);
2459 }
2460 }
2461 for key in ["path", "extension", "query", "command", "reason"] {
2462 if let Some(Value::String(s)) = map.get_mut(key) {
2463 *s = normalize_string_arg(s);
2464 }
2465 }
2466 }
2467 _ => {}
2468 }
2469}
2470
/// Removes a single layer of quoting from `input`, if there is one.
///
/// Returns `None` unless `input` both starts and ends with the same quote
/// character (`"`, `'` or a backtick) and is at least two bytes long.
/// Otherwise the text between the quotes is returned with `\"` turned into
/// `"`, `\\` turned into `\`, and surrounding whitespace trimmed.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    let quote = input
        .chars()
        .next()
        .filter(|c| matches!(c, '"' | '\'' | '`'))?;
    // A lone quote character leaves an empty remainder, which has no closing
    // quote to strip, so it is rejected here.
    let inner = input[quote.len_utf8()..].strip_suffix(quote)?;

    let without_escaped_quotes = inner.replace("\\\"", "\"");
    let without_escaped_backslashes = without_escaped_quotes.replace("\\\\", "\\");
    Some(without_escaped_backslashes.trim().to_string())
}
2484
/// Trims `input` and peels off every layer of matching surrounding quotes.
///
/// A layer is a leading and trailing `"`, `'` or backtick of the same kind.
/// After each layer is removed the remainder is trimmed again, and peeling
/// stops as soon as the text is no longer wrapped in a matching pair.
fn normalize_string_arg(input: &str) -> String {
    let mut current = input.trim();
    loop {
        let peeled = match current.chars().next() {
            // The quote is one byte wide, so slicing from 1 is always valid.
            Some(quote @ ('"' | '\'' | '`')) => current[1..].strip_suffix(quote),
            _ => None,
        };
        match peeled {
            Some(inner) => current = inner.trim(),
            None => break,
        }
    }
    current.to_string()
}
2502
2503fn normalize_regex_pattern(input: &str) -> String {
2504 let out = normalize_string_arg(input);
2505 if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2506 out[1..out.len() - 1].to_string()
2507 } else {
2508 out
2509 }
2510}
2511
2512fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2513 let mut system_blocks = Vec::new();
2514 let mut prepared = Vec::new();
2515 let mut seeded = false;
2516
2517 for message in messages {
2518 if message.role == "system" {
2519 let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2520 .trim()
2521 .to_string();
2522 if !cleaned.is_empty() {
2523 system_blocks.push(cleaned);
2524 }
2525 continue;
2526 }
2527
2528 let mut clone = message.clone();
2529 clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2530
2531 if !seeded && message.role == "user" {
2532 let mut merged = String::new();
2533 if !system_blocks.is_empty() {
2534 merged.push_str("System instructions for this turn:\n");
2535 merged.push_str(&system_blocks.join("\n\n"));
2536 merged.push_str("\n\n");
2537 }
2538 merged.push_str(clone.content.as_str());
2539 clone.content = MessageContent::Text(merged);
2540 seeded = true;
2541 }
2542
2543 prepared.push(clone);
2544 }
2545
2546 if !seeded && !system_blocks.is_empty() {
2547 prepared.insert(
2548 0,
2549 ChatMessage::user(&format!(
2550 "System instructions for this turn:\n{}",
2551 system_blocks.join("\n\n")
2552 )),
2553 );
2554 }
2555
2556 prepared
2557}
2558
/// Deletes legacy turn-wrapper markers from `text` and trims the result.
///
/// The opening markers (`<|turn>` followed by `system`, `user`, `assistant`
/// or `tool` and a newline) and the closing marker `<turn|>` are removed one
/// kind at a time, in the order listed below.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const TURN_MARKERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let mut cleaned = text.to_string();
    for marker in TURN_MARKERS.iter() {
        cleaned = cleaned.replace(*marker, "");
    }
    cleaned.trim().to_string()
}
2568
2569pub fn strip_native_tool_call_text(text: &str) -> String {
2570 use regex::Regex;
2571 let re_call = Regex::new(
2573 r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2574 ).unwrap();
2575 let re_xml = Regex::new(r#"(?s)<tool_call>\s*<function=.*?>.*?</tool_call>"#).unwrap();
2577 let re_response =
2578 Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2579 .unwrap();
2580 let without_calls = re_call.replace_all(text, "");
2581 let without_xml = re_xml.replace_all(without_calls.as_ref(), "");
2582 re_response
2583 .replace_all(without_xml.as_ref(), "")
2584 .trim()
2585 .to_string()
2586}
2587
// Unit tests for system-prompt construction and for parsing/stripping tool
// calls that models emit inline as text.
#[cfg(test)]
mod tests {
    use super::*;

    /// The generated system prompt must contain the crate's version string.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let system = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);
        assert!(system.contains(crate::HEMATITE_VERSION));
    }

    /// A `call:` tool call opened with `<|tool_call>` and closed with
    /// `<tool_call|>` is extracted with typed arguments, and the whole span is
    /// removed by `strip_native_tool_call_text`.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        // The raw string keeps the backslashes before the quotes literally.
        let text = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "read_file");

        // Numeric arguments come back as JSON numbers, the path as a string
        // with its escaped quotes removed.
        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args.get("limit").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(args.get("offset").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(
            args.get("path").and_then(|v| v.as_str()),
            Some("src/ui/tui.rs")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<tool_call|>"));
    }

    /// When the transcript contains `<|tool_response>` segments after each
    /// call, both calls are still extracted in order and stripping leaves no
    /// tool-call or tool-response tags behind.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let text = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "list_files");
        assert_eq!(calls[1].function.name, "read_file");

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<|tool_response"));
        assert!(!stripped.contains("<tool_response|>"));
    }

    /// An XML-style `<tool_call><function=...>` block that follows ordinary
    /// prose is extracted, with each parameter value trimmed of its
    /// surrounding newlines, and is removed by stripping.
    #[test]
    fn extracts_qwen_xml_tool_calls_from_reasoning() {
        let text = r#"Based on the project structure, I need to check the binary.
<tool_call>
<function=shell>
<parameter=command>
ls -la hematite.exe
</parameter>
<parameter=reason>
Check if the binary exists
</parameter>
</function>
</tool_call>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "shell");

        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(
            args.get("command").and_then(|v| v.as_str()),
            Some("ls -la hematite.exe")
        );
        assert_eq!(
            args.get("reason").and_then(|v| v.as_str()),
            Some("Check if the binary exists")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("<function=shell>"));
    }
}