1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
7pub struct InferenceEngine {
10 pub client: reqwest::Client,
11 pub api_url: String,
12 pub base_url: String,
15 pub species: String,
16 pub snark: u8,
17 pub kv_semaphore: Semaphore,
18 pub model: std::sync::RwLock<String>,
20 pub context_length: std::sync::atomic::AtomicUsize,
22 pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
23 pub worker_model: Option<String>,
25 pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
27 pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
29}
30
/// Reports whether `model` names a Gemma 4 model.
///
/// The check is ASCII case-insensitive and accepts both the hyphenated
/// (`gemma-4`) and the compact (`gemma4`) spelling anywhere in the name.
pub fn is_gemma4_model_name(model: &str) -> bool {
    let normalized = model.to_ascii_lowercase();
    ["gemma-4", "gemma4"]
        .iter()
        .any(|marker| normalized.contains(marker))
}
35
36fn should_use_gemma_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37 is_gemma4_model_name(model) && engine.gemma_native_formatting_enabled()
38}
39
40#[derive(Serialize, Clone, Debug)]
43pub struct ToolDefinition {
44 #[serde(rename = "type")]
45 pub tool_type: String,
46 pub function: ToolFunction,
47 #[serde(skip_serializing, skip_deserializing)]
48 pub metadata: ToolMetadata,
49}
50
51#[derive(Serialize, Clone, Debug)]
52pub struct ToolFunction {
53 pub name: String,
54 pub description: String,
55 pub parameters: Value,
56}
57
/// Coarse grouping assigned to a tool by `tool_metadata_for_name`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ToolCategory {
    /// File reading, listing, and searching tools.
    RepoRead,
    /// File writing, editing, and patching tools.
    RepoWrite,
    /// Shell execution and host inspection.
    Runtime,
    /// Project mapping and runtime-flow tracing.
    Architecture,
    /// Toolchain description.
    Toolchain,
    /// Build verification.
    Verification,
    /// Git operations.
    Git,
    /// Web research and documentation fetching.
    Research,
    /// Image analysis.
    Vision,
    /// Language-server queries.
    Lsp,
    /// Pinning, clarification, and task management.
    Workflow,
    /// Tools whose name starts with `mcp__`.
    External,
    /// Any tool name not recognized by the lookup.
    Other,
}
74
75#[derive(Clone, Copy, Debug, PartialEq, Eq)]
76pub struct ToolMetadata {
77 pub category: ToolCategory,
78 pub mutates_workspace: bool,
79 pub external_surface: bool,
80 pub trust_sensitive: bool,
81 pub read_only_friendly: bool,
82 pub plan_scope: bool,
83}
84
85pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
86 if name.starts_with("mcp__") {
87 let lower = name.to_ascii_lowercase();
88 let mutates_workspace = [
89 "__edit",
90 "__write",
91 "__create",
92 "__move",
93 "__delete",
94 "__remove",
95 "__rename",
96 "__replace",
97 "__patch",
98 ]
99 .iter()
100 .any(|needle| lower.contains(needle));
101 return ToolMetadata {
102 category: ToolCategory::External,
103 mutates_workspace,
104 external_surface: true,
105 trust_sensitive: true,
106 read_only_friendly: !mutates_workspace,
107 plan_scope: false,
108 };
109 }
110
111 match name {
112 "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
113 category: ToolCategory::RepoRead,
114 mutates_workspace: false,
115 external_surface: false,
116 trust_sensitive: false,
117 read_only_friendly: true,
118 plan_scope: true,
119 },
120 "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => ToolMetadata {
121 category: ToolCategory::RepoWrite,
122 mutates_workspace: true,
123 external_surface: false,
124 trust_sensitive: true,
125 read_only_friendly: false,
126 plan_scope: true,
127 },
128 "map_project" | "trace_runtime_flow" => ToolMetadata {
129 category: ToolCategory::Architecture,
130 mutates_workspace: false,
131 external_surface: false,
132 trust_sensitive: false,
133 read_only_friendly: true,
134 plan_scope: false,
135 },
136 "describe_toolchain" => ToolMetadata {
137 category: ToolCategory::Toolchain,
138 mutates_workspace: false,
139 external_surface: false,
140 trust_sensitive: false,
141 read_only_friendly: true,
142 plan_scope: false,
143 },
144 "shell" => ToolMetadata {
145 category: ToolCategory::Runtime,
146 mutates_workspace: true,
147 external_surface: false,
148 trust_sensitive: true,
149 read_only_friendly: false,
150 plan_scope: false,
151 },
152 "inspect_host" => ToolMetadata {
153 category: ToolCategory::Runtime,
154 mutates_workspace: false,
155 external_surface: false,
156 trust_sensitive: false,
157 read_only_friendly: true,
158 plan_scope: false,
159 },
160 "verify_build" => ToolMetadata {
161 category: ToolCategory::Verification,
162 mutates_workspace: false,
163 external_surface: false,
164 trust_sensitive: false,
165 read_only_friendly: true,
166 plan_scope: false,
167 },
168 "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
169 ToolMetadata {
170 category: ToolCategory::Git,
171 mutates_workspace: true,
172 external_surface: false,
173 trust_sensitive: true,
174 read_only_friendly: false,
175 plan_scope: false,
176 }
177 }
178 "research_web" | "fetch_docs" => ToolMetadata {
179 category: ToolCategory::Research,
180 mutates_workspace: false,
181 external_surface: false,
182 trust_sensitive: false,
183 read_only_friendly: true,
184 plan_scope: false,
185 },
186 "vision_analyze" => ToolMetadata {
187 category: ToolCategory::Vision,
188 mutates_workspace: false,
189 external_surface: false,
190 trust_sensitive: false,
191 read_only_friendly: true,
192 plan_scope: false,
193 },
194 "lsp_definitions"
195 | "lsp_references"
196 | "lsp_hover"
197 | "lsp_rename_symbol"
198 | "lsp_get_diagnostics"
199 | "lsp_search_symbol" => ToolMetadata {
200 category: ToolCategory::Lsp,
201 mutates_workspace: false,
202 external_surface: false,
203 trust_sensitive: false,
204 read_only_friendly: true,
205 plan_scope: false,
206 },
207 "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
208 category: ToolCategory::Workflow,
209 mutates_workspace: false,
210 external_surface: false,
211 trust_sensitive: false,
212 read_only_friendly: true,
213 plan_scope: true,
214 },
215 "manage_tasks" => ToolMetadata {
216 category: ToolCategory::Workflow,
217 mutates_workspace: false,
218 external_surface: false,
219 trust_sensitive: false,
220 read_only_friendly: true,
221 plan_scope: false,
222 },
223 _ => ToolMetadata {
224 category: ToolCategory::Other,
225 mutates_workspace: false,
226 external_surface: false,
227 trust_sensitive: false,
228 read_only_friendly: true,
229 plan_scope: false,
230 },
231 }
232}
233
234#[derive(Serialize, Deserialize, Clone, Debug)]
239pub struct ChatMessage {
240 pub role: String,
241 pub content: MessageContent,
243 #[serde(default, skip_serializing_if = "Vec::is_empty")]
245 pub tool_calls: Vec<ToolCallResponse>,
246 #[serde(skip_serializing_if = "Option::is_none")]
248 pub tool_call_id: Option<String>,
249 #[serde(skip_serializing_if = "Option::is_none")]
251 pub name: Option<String>,
252}
253
254#[derive(Serialize, Deserialize, Clone, Debug)]
255#[serde(untagged)]
256pub enum MessageContent {
257 Text(String),
258 Parts(Vec<ContentPart>),
259}
260
261#[derive(Serialize, Deserialize, Clone, Debug)]
262#[serde(tag = "type")]
263pub enum ContentPart {
264 #[serde(rename = "text")]
265 Text { text: String },
266 #[serde(rename = "image_url")]
267 ImageUrl { image_url: ImageUrlSource },
268}
269
270#[derive(Serialize, Deserialize, Clone, Debug)]
271pub struct ImageUrlSource {
272 pub url: String,
273}
274
275impl Default for MessageContent {
276 fn default() -> Self {
277 MessageContent::Text(String::new())
278 }
279}
280
281impl MessageContent {
282 pub fn as_str(&self) -> &str {
283 match self {
284 MessageContent::Text(s) => s,
285 MessageContent::Parts(parts) => {
286 for part in parts {
287 if let ContentPart::Text { text } = part {
288 return text;
289 }
290 }
291 ""
292 }
293 }
294 }
295}
296
297impl ChatMessage {
298 pub fn system(content: &str) -> Self {
299 Self {
300 role: "system".into(),
301 content: MessageContent::Text(content.into()),
302 tool_calls: Vec::new(),
303 tool_call_id: None,
304 name: None,
305 }
306 }
307 pub fn user(content: &str) -> Self {
308 Self {
309 role: "user".into(),
310 content: MessageContent::Text(content.into()),
311 tool_calls: Vec::new(),
312 tool_call_id: None,
313 name: None,
314 }
315 }
316 pub fn user_with_image(text: &str, image_url: &str) -> Self {
317 let mut text_parts = text.to_string();
318 if !text_parts.contains("<|image|>") {
319 text_parts.push_str(" <|image|>");
320 }
321 Self {
322 role: "user".into(),
323 content: MessageContent::Parts(vec![
324 ContentPart::Text { text: text_parts },
325 ContentPart::ImageUrl {
326 image_url: ImageUrlSource {
327 url: image_url.into(),
328 },
329 },
330 ]),
331 tool_calls: Vec::new(),
332 tool_call_id: None,
333 name: None,
334 }
335 }
336 pub fn assistant_text(content: &str) -> Self {
337 Self {
338 role: "assistant".into(),
339 content: MessageContent::Text(content.into()),
340 tool_calls: Vec::new(),
341 tool_call_id: None,
342 name: None,
343 }
344 }
345 pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
346 Self {
347 role: "assistant".into(),
348 content: MessageContent::Text(content.into()),
349 tool_calls: calls,
350 tool_call_id: None,
351 name: None,
352 }
353 }
354 pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
355 Self::tool_result_for_model(tool_call_id, fn_name, content, "")
356 }
357
358 pub fn tool_result_for_model(
361 tool_call_id: &str,
362 fn_name: &str,
363 content: &str,
364 model: &str,
365 ) -> Self {
366 let body = if is_gemma4_model_name(model) {
367 format!(
368 "<|tool_response>response:{}{}{}<tool_response|>",
369 fn_name, "{", content
370 )
371 } else {
372 content.to_string()
373 };
374 Self {
375 role: "tool".into(),
376 content: MessageContent::Text(body),
377 tool_calls: Vec::new(),
378 tool_call_id: Some(tool_call_id.into()),
379 name: Some(fn_name.into()),
380 }
381 }
382}
383
384#[derive(Serialize, Deserialize, Clone, Debug)]
387pub struct ToolCallResponse {
388 pub id: String,
389 #[serde(rename = "type")]
390 pub call_type: String,
391 pub function: ToolCallFn,
392}
393
394#[derive(Serialize, Deserialize, Clone, Debug)]
395pub struct ToolCallFn {
396 pub name: String,
397 pub arguments: String,
399}
400
401#[derive(Serialize)]
404struct ChatRequest {
405 model: String,
406 messages: Vec<ChatMessage>,
407 temperature: f32,
408 stream: bool,
409 #[serde(skip_serializing_if = "Option::is_none")]
410 tools: Option<Vec<ToolDefinition>>,
411}
412
413#[derive(Deserialize, Debug)]
414struct ChatResponse {
415 choices: Vec<ResponseChoice>,
416 usage: Option<TokenUsage>,
417}
418
419#[derive(Deserialize, Debug, Clone)]
420pub struct TokenUsage {
421 pub prompt_tokens: usize,
422 pub completion_tokens: usize,
423 pub total_tokens: usize,
424 #[serde(default)]
425 pub prompt_cache_hit_tokens: usize,
426 #[serde(default)]
427 pub cache_read_input_tokens: usize,
428}
429
430#[derive(Deserialize, Debug)]
431struct ResponseChoice {
432 message: ResponseMessage,
433 #[serde(default)]
434 finish_reason: Option<String>,
435}
436
437#[derive(Deserialize, Debug)]
438struct ResponseMessage {
439 content: Option<String>,
440 tool_calls: Option<Vec<ToolCallResponse>>,
441}
442
// NOTE(review): presumably the lower and upper bounds for the number of
// tokens reserved for model output; the code that uses them is not visible
// in this section.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1_024;
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4_096;
445
/// Returns `true` for context windows of at most 8,192 tokens.
fn is_tiny_context_window(context_length: usize) -> bool {
    matches!(context_length, 0..=8_192)
}
449
/// Returns `true` for context windows above 8,192 and up to 49,152 tokens
/// (upper bound inclusive).
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
453
454pub fn is_compact_context_window_pub(context_length: usize) -> bool {
455 is_compact_context_window(context_length)
456}
457
/// Detects provider error text that reports a context-length overflow.
///
/// `lower` is matched verbatim against lowercase needles, so callers are
/// expected to pass already-lowercased text. A match is either both `n_keep`
/// and `n_ctx` appearing together, or any one of the standalone phrases.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const STANDALONE_PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    let keep_and_ctx = lower.contains("n_keep") && lower.contains("n_ctx");
    keep_and_ctx
        || STANDALONE_PHRASES
            .iter()
            .any(|phrase| lower.contains(phrase))
}
465
466fn classify_runtime_failure_tag(detail: &str) -> &'static str {
467 let lower = detail.to_ascii_lowercase();
468 if lower.contains("context_window_blocked")
469 || lower.contains("context ceiling reached")
470 || lower.contains("exceeds the")
471 || is_provider_context_limit_detail(&lower)
472 {
473 "context_window"
474 } else if lower.contains("empty response from model")
475 || lower.contains("model returned an empty response")
476 {
477 "empty_model_response"
478 } else if lower.contains("action blocked:")
479 || lower.contains("access denied")
480 || lower.contains("declined by user")
481 {
482 "tool_policy_blocked"
483 } else {
484 "provider_degraded"
485 }
486}
487
/// Returns the operator guidance sentence for a failure tag.
///
/// Unknown tags (including `provider_degraded`) get the generic
/// retry-then-restart advice.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const CONTEXT_WINDOW: &str = "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.";
    const EMPTY_RESPONSE: &str = "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.";
    const POLICY_BLOCKED: &str =
        "Stay inside the allowed workflow or switch modes before retrying.";
    const GENERIC: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";

    if tag == "context_window" {
        CONTEXT_WINDOW
    } else if tag == "empty_model_response" {
        EMPTY_RESPONSE
    } else if tag == "tool_policy_blocked" {
        POLICY_BLOCKED
    } else {
        GENERIC
    }
}
502
503fn format_runtime_failure_message(detail: &str) -> String {
504 let tag = classify_runtime_failure_tag(detail);
505 format!(
506 "[failure:{}] {} Detail: {}",
507 tag,
508 runtime_failure_guidance(tag),
509 detail.trim()
510 )
511}
512
/// Health state of the model provider, as reported in status events.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProviderRuntimeState {
    /// Provider is starting up.
    Booting,
    /// Provider is serving normally.
    Live,
    /// Provider is being recovered.
    Recovering,
    /// Generic failure state; also used for unrecognized failure tags.
    Degraded,
    /// A context-window failure was reported.
    ContextWindow,
    /// The model returned an empty response.
    EmptyResponse,
}
522
/// Health state of the MCP integration, as reported in status events.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum McpRuntimeState {
    /// No MCP servers are configured.
    Unconfigured,
    /// MCP is working.
    Healthy,
    /// MCP is partially working.
    Degraded,
    /// MCP is not working.
    Failed,
}
530
/// Operator-facing checkpoint states; each has a snake_case label.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum OperatorCheckpointState {
    /// Nothing to report.
    Idle,
    /// The provider is being recovered.
    RecoveringProvider,
    /// A budget was reduced.
    BudgetReduced,
    /// Conversation history was compacted.
    HistoryCompacted,
    /// Blocked by the context window.
    BlockedContextWindow,
    /// Blocked by policy.
    BlockedPolicy,
    /// Blocked pending recent file evidence.
    BlockedRecentFileEvidence,
    /// Blocked pending an exact line window.
    BlockedExactLineWindow,
    /// Blocked because of a tool loop.
    BlockedToolLoop,
    /// Blocked pending verification.
    BlockedVerification,
}
544
545impl OperatorCheckpointState {
546 pub fn label(self) -> &'static str {
547 match self {
548 OperatorCheckpointState::Idle => "idle",
549 OperatorCheckpointState::RecoveringProvider => "recovering_provider",
550 OperatorCheckpointState::BudgetReduced => "budget_reduced",
551 OperatorCheckpointState::HistoryCompacted => "history_compacted",
552 OperatorCheckpointState::BlockedContextWindow => "blocked_context_window",
553 OperatorCheckpointState::BlockedPolicy => "blocked_policy",
554 OperatorCheckpointState::BlockedRecentFileEvidence => "blocked_recent_file_evidence",
555 OperatorCheckpointState::BlockedExactLineWindow => "blocked_exact_line_window",
556 OperatorCheckpointState::BlockedToolLoop => "blocked_tool_loop",
557 OperatorCheckpointState::BlockedVerification => "blocked_verification",
558 }
559 }
560}
561
562fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
563 match tag {
564 "context_window" => ProviderRuntimeState::ContextWindow,
565 "empty_model_response" => ProviderRuntimeState::EmptyResponse,
566 _ => ProviderRuntimeState::Degraded,
567 }
568}
569
/// Produces a one-line summary of a runtime failure.
///
/// The three known tags (`context_window`, `empty_model_response`,
/// `tool_policy_blocked`) map to fixed sentences. Any other tag yields
/// `LM Studio degraded: EXCERPT`, where the excerpt is the first 12
/// whitespace-separated words of `detail`, capped at 110 bytes and followed
/// by `...` when it had to be cut. A blank `detail` yields a fixed fallback
/// sentence instead.
///
/// The cap is applied on a UTF-8 character boundary, so non-ASCII details
/// are shortened to at most 110 bytes rather than causing a panic.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    const MAX_EXCERPT_WORDS: usize = 12;
    const MAX_EXCERPT_BYTES: usize = 110;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(MAX_EXCERPT_WORDS)
                .collect::<Vec<_>>()
                .join(" ");
            if excerpt.len() > MAX_EXCERPT_BYTES {
                // `String::truncate` panics when the cut lands inside a
                // multi-byte character, so back up to the nearest boundary.
                // Index 0 is always a boundary, so the loop terminates.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }
            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
603
604#[derive(Debug)]
607pub enum InferenceEvent {
608 Token(String),
610 MutedToken(String),
612 Thought(String),
614 VoiceStatus(String),
616 ToolCallStart {
618 id: String,
619 name: String,
620 args: String,
621 },
622 ToolCallResult {
624 id: String,
625 name: String,
626 output: String,
627 is_error: bool,
628 },
629 ApprovalRequired {
633 id: String,
634 name: String,
635 display: String,
636 diff: Option<String>,
639 responder: tokio::sync::oneshot::Sender<bool>,
640 },
641 Done,
643 Error(String),
645 ProviderStatus {
647 state: ProviderRuntimeState,
648 summary: String,
649 },
650 OperatorCheckpoint {
652 state: OperatorCheckpointState,
653 summary: String,
654 },
655 RecoveryRecipe { summary: String },
657 McpStatus {
659 state: McpRuntimeState,
660 summary: String,
661 },
662 CompactionPressure {
664 estimated_tokens: usize,
665 threshold_tokens: usize,
666 percent: u8,
667 },
668 PromptPressure {
670 estimated_input_tokens: usize,
671 reserved_output_tokens: usize,
672 estimated_total_tokens: usize,
673 context_length: usize,
674 percent: u8,
675 },
676 TaskProgress {
678 id: String,
679 label: String,
680 progress: u8,
681 },
682 UsageUpdate(TokenUsage),
684 RuntimeProfile {
686 model_id: String,
687 context_length: usize,
688 },
689 VeinStatus {
691 file_count: usize,
692 embedded_count: usize,
693 docs_only: bool,
694 },
695 VeinContext { paths: Vec<String> },
698 SoulReroll {
700 species: String,
701 rarity: String,
702 shiny: bool,
703 personality: String,
704 },
705 EmbedProfile { model_id: Option<String> },
707}
708
709impl InferenceEngine {
712 pub fn new(
713 api_url: String,
714 species: String,
715 snark: u8,
716 ) -> Result<Self, Box<dyn std::error::Error>> {
717 let client = reqwest::Client::builder()
718 .timeout(std::time::Duration::from_secs(180))
719 .build()?;
720
721 let base_url = {
723 let trimmed = api_url.trim_end_matches('/');
724 if let Some(scheme_end) = trimmed.find("://") {
725 let after_scheme = &trimmed[scheme_end + 3..];
726 if let Some(path_start) = after_scheme.find('/') {
727 format!(
728 "{}://{}",
729 &trimmed[..scheme_end],
730 &after_scheme[..path_start]
731 )
732 } else {
733 trimmed.to_string()
734 }
735 } else {
736 trimmed.to_string()
737 }
738 };
739
740 let api_url = if api_url.ends_with("/chat/completions") {
741 api_url
742 } else if api_url.ends_with("/") {
743 format!("{}chat/completions", api_url)
744 } else {
745 format!("{}/chat/completions", api_url)
746 };
747
748 Ok(Self {
749 client,
750 api_url,
751 base_url,
752 species,
753 snark,
754 kv_semaphore: Semaphore::new(3),
755 model: std::sync::RwLock::new(String::new()),
756 context_length: std::sync::atomic::AtomicUsize::new(32_768), economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
758 worker_model: None,
759 gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
760 cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
761 })
762 }
763
764 pub fn set_gemma_native_formatting(&self, enabled: bool) {
765 self.gemma_native_formatting
766 .store(enabled, std::sync::atomic::Ordering::SeqCst);
767 }
768
769 pub fn gemma_native_formatting_enabled(&self) -> bool {
770 self.gemma_native_formatting
771 .load(std::sync::atomic::Ordering::SeqCst)
772 }
773
774 pub fn current_model(&self) -> String {
775 self.model.read().map(|g| g.clone()).unwrap_or_default()
776 }
777
778 pub fn current_context_length(&self) -> usize {
779 self.context_length
780 .load(std::sync::atomic::Ordering::SeqCst)
781 }
782
783 pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
784 if let Ok(mut guard) = self.model.write() {
785 *guard = model.to_string();
786 }
787 self.context_length
788 .store(context_length, std::sync::atomic::Ordering::SeqCst);
789 }
790
791 pub async fn health_check(&self) -> bool {
793 let url = format!("{}/v1/models", self.base_url);
794 match self.client.get(&url).send().await {
795 Ok(resp) => resp.status().is_success(),
796 Err(_) => false,
797 }
798 }
799
800 pub async fn get_loaded_model(&self) -> Option<String> {
808 #[derive(Deserialize)]
809 struct ModelList {
810 data: Vec<ModelEntry>,
811 }
812 #[derive(Deserialize)]
813 struct ModelEntry {
814 id: String,
815 #[serde(rename = "type", default)]
816 model_type: String,
817 #[serde(default)]
818 state: String,
819 }
820
821 if let Ok(resp) = self
823 .client
824 .get(format!("{}/api/v0/models", self.base_url))
825 .send()
826 .await
827 {
828 if let Ok(list) = resp.json::<ModelList>().await {
829 let chat_model = list
830 .data
831 .into_iter()
832 .find(|m| m.model_type != "embeddings" && m.state == "loaded")
833 .map(|m| m.id)
834 .unwrap_or_default();
835 return Some(chat_model);
836 }
837 }
838
839 let resp = self
841 .client
842 .get(format!("{}/v1/models", self.base_url))
843 .send()
844 .await
845 .ok()?;
846 let list: ModelList = resp.json().await.ok()?;
847 Some(
848 list.data
849 .into_iter()
850 .find(|m| !m.id.to_lowercase().contains("embed"))
851 .map(|m| m.id)
852 .unwrap_or_default(),
853 )
854 }
855
856 pub async fn get_embedding_model(&self) -> Option<String> {
862 #[derive(Deserialize)]
863 struct ModelList {
864 data: Vec<ModelEntry>,
865 }
866 #[derive(Deserialize)]
867 struct ModelEntry {
868 id: String,
869 #[serde(rename = "type", default)]
870 model_type: String,
871 #[serde(default)]
872 state: String,
873 }
874 let resp = self
875 .client
876 .get(format!("{}/api/v0/models", self.base_url))
877 .send()
878 .await
879 .ok()?;
880 let list: ModelList = resp.json().await.ok()?;
881 list.data
882 .into_iter()
883 .find(|m| m.model_type == "embeddings" && m.state == "loaded")
884 .map(|m| m.id)
885 }
886
887 pub async fn detect_context_length(&self) -> usize {
893 #[derive(Deserialize)]
894 struct LmStudioModel {
895 id: Option<String>,
896 #[serde(rename = "type", default)]
897 model_type: String,
898 state: Option<String>,
899 loaded_context_length: Option<u64>,
900 context_length: Option<u64>,
901 max_context_length: Option<u64>,
902 }
903 #[derive(Deserialize)]
904 struct LmStudioList {
905 data: Vec<LmStudioModel>,
906 }
907
908 if let Ok(resp) = self
910 .client
911 .get(format!("{}/api/v0/models", self.base_url))
912 .send()
913 .await
914 {
915 if let Ok(list) = resp.json::<LmStudioList>().await {
916 let target_model = self.current_model().to_ascii_lowercase();
917 let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
919 let loaded = list
920 .data
921 .iter()
922 .find(|m| {
923 non_embed(m)
924 && m.state.as_deref() == Some("loaded")
925 && m.id
926 .as_deref()
927 .map(|id| id.eq_ignore_ascii_case(&target_model))
928 .unwrap_or(false)
929 })
930 .or_else(|| {
931 list.data
932 .iter()
933 .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
934 })
935 .or_else(|| {
936 list.data.iter().find(|m| {
937 non_embed(m)
938 && m.id
939 .as_deref()
940 .map(|id| id.eq_ignore_ascii_case(&target_model))
941 .unwrap_or(false)
942 })
943 })
944 .or_else(|| list.data.iter().find(|m| non_embed(m)));
945
946 if let Some(model) = loaded {
947 if let Some(ctx) = model.loaded_context_length {
948 if ctx > 0 {
949 return ctx as usize;
950 }
951 }
952 if let Some(ctx) = model.context_length {
953 if ctx > 0 {
954 return ctx as usize;
955 }
956 }
957 if let Some(ctx) = model.max_context_length {
958 if ctx > 0 && ctx <= 32_768 {
959 return ctx as usize;
960 }
961 }
962 }
963 }
964 }
965
966 if self.current_model().to_lowercase().contains("gemma-4") {
970 return 32_768;
971 }
972
973 32_768
974 }
975
976 pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
977 let previous_model = self.current_model();
978 let previous_context = self.current_context_length();
979
980 let detected_model = match self.get_loaded_model().await {
981 Some(m) if !m.is_empty() => m, Some(_) => "no model loaded".to_string(), None => previous_model.clone(), };
985
986 if !detected_model.is_empty() && detected_model != previous_model {
987 if let Ok(mut guard) = self.model.write() {
988 *guard = detected_model.clone();
989 }
990 }
991
992 let detected_context = self.detect_context_length().await;
993 let effective_model = if detected_model.is_empty() {
994 previous_model.clone()
995 } else {
996 detected_model
997 };
998
999 let changed = effective_model != previous_model || detected_context != previous_context;
1000 self.set_runtime_profile(&effective_model, detected_context);
1001
1002 Some((effective_model, detected_context, changed))
1003 }
1004
1005 pub fn build_system_prompt(
1006 &self,
1007 snark: u8,
1008 chaos: u8,
1009 brief: bool,
1010 professional: bool,
1011 tools: &[ToolDefinition],
1012 reasoning_history: Option<&str>,
1013 mcp_tools: &[crate::agent::mcp::McpTool],
1014 ) -> String {
1015 let mut sys = self.build_system_prompt_legacy(
1016 snark,
1017 chaos,
1018 brief,
1019 professional,
1020 tools,
1021 reasoning_history,
1022 );
1023
1024 if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1025 sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1026 sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1027 for tool in mcp_tools {
1028 let description = tool
1029 .description
1030 .as_deref()
1031 .unwrap_or("No description provided.");
1032 sys.push_str(&format!("- {}: {}\n", tool.name, description));
1033 }
1034 }
1035
1036 sys
1037 }
1038
1039 pub fn build_system_prompt_legacy(
1040 &self,
1041 snark: u8,
1042 _chaos: u8,
1043 brief: bool,
1044 professional: bool,
1045 tools: &[ToolDefinition],
1046 reasoning_history: Option<&str>,
1047 ) -> String {
1048 let current_context_length = self.current_context_length();
1049 if is_tiny_context_window(current_context_length) {
1050 return self.build_system_prompt_tiny(brief, professional);
1051 }
1052 if is_compact_context_window(current_context_length) {
1053 return self.build_system_prompt_compact(brief, professional, tools);
1054 }
1055
1056 let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
1058 - You are Hematite, a local coding system working on the user's machine.\n\
1059 - The running Hematite build is ");
1060 sys.push_str(&crate::hematite_version_display());
1061 sys.push_str(".\n\
1062 - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
1063 - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
1064 - For simple questions, answer briefly in plain language.\n\
1065 - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
1066 - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
1067 - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
1068 - Keep internal reasoning inside channel delimiters.\n\
1069 - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
1070 <turn|>\n\n");
1071
1072 if let Some(history) = reasoning_history {
1073 if !history.is_empty() {
1074 sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
1075 sys.push_str(history);
1076 sys.push_str("\n\n");
1077 }
1078 }
1079
1080 if brief {
1082 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
1083 - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
1084 - Depth: Surface-level verification only.\n\n");
1085 } else {
1086 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
1087 - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
1088 - Depth: Full multi-step derivation required.\n\n");
1089 }
1090
1091 let os = std::env::consts::OS;
1093 if professional {
1094 sys.push_str(&format!(
1095 "You are Hematite, a local coding system running on {}. \
1096 The TUI is one interface layer, not your whole identity. \
1097 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
1098 Skip filler and keep the focus on the work.\n",
1099 os
1100 ));
1101 } else {
1102 sys.push_str(&format!(
1103 "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
1104 The terminal UI is only one surface of the system. \
1105 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
1106 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
1107 self.species, snark, os
1108 ));
1109 }
1110
1111 let current_model = self.current_model();
1113 if !current_model.is_empty() {
1114 sys.push_str(&format!(
1115 "Loaded model: {} | Context window: {} tokens. \
1116 Calibrate response length and tool-call depth to fit within this budget.\n\n",
1117 current_model, current_context_length
1118 ));
1119 if is_gemma4_model_name(¤t_model) {
1120 sys.push_str(
1121 "Gemma 4 native note: prefer exact tool JSON with no extra prose when calling tools. \
1122 Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
1123 For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
1124 );
1125 }
1126 } else {
1127 sys.push_str(&format!(
1128 "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
1129 current_context_length
1130 ));
1131 }
1132
1133 let shell_desc = if cfg!(target_os = "windows") {
1135 "[EXTERNAL SHELL]: `powershell` (Windows).\n\
1136 - Use ONLY for builds, tests, or file migrations. \n\
1137 - You MUST use the `powershell` tool directly. \n\
1138 - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
1139 } else {
1140 "[EXTERNAL SHELL]: `bash` (Unix).\n\
1141 - Use ONLY for builds, tests, or file migrations. \n\
1142 - NEVER wrap bash in other shells. \n\n"
1143 };
1144
1145 sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
1146 [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
1147 - These are the ONLY way to explore and modify code. \n\
1148 - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
1149 sys.push_str(shell_desc);
1150
1151 sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
1153 SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
1154
1155 if brief {
1156 sys.push_str(
1157 "BRIEF MODE: Respond in exactly ONE concise sentence unless providing code.\n\n",
1158 );
1159 }
1160
1161 if cfg!(target_os = "windows") {
1162 sys.push_str("Shell Protocol: You are running on WINDOWS. You MUST NOT use 'bash' or '/dev/null'. \
1163 You MUST use 'powershell' (pwsh) for all shell tasks. \
1164 DO NOT attempt to manipulate Linux-style paths like /dev, /etc, or /sys.\n\n");
1165 } else if cfg!(target_os = "macos") {
1166 sys.push_str(
1167 "Shell Protocol: You are running on macOS. Use 'bash' or 'zsh' for shell tasks. \
1168 Standard Unix paths apply.\n\n",
1169 );
1170 } else {
1171 sys.push_str(
1172 "Shell Protocol: You are running on Linux. Use 'bash' for shell tasks. \
1173 Standard Unix paths apply.\n\n",
1174 );
1175 }
1176
1177 sys.push_str("OUTPUT RULES:\n\
1178 1. Your internal reasoning goes in <think>...</think> blocks. Do NOT output reasoning as plain text.\n\
1179 2. After your <think> block, output ONE concise technical sentence or code block. Nothing else.\n\
1180 3. Do NOT call tools named 'thought', 'think', 'reasoning', or any meta-cognitive name. These are not tools.\n\
1181 4. NEGATIVE CONSTRAINT: Never use a string containing a dot (.), slash (/), or backslash (\\) as a tool name. Paths are NOT tools.\n\
1182 5. NEGATIVE CONSTRAINT: Never use the name of a class, struct, or module as a tool name unless it is explicitly in the tool list.\n\
1183 6. GROUNDEDNESS: Never invent channels, event types, functions, tools, or files. If a detail is not verified from the repo or tool output, say `uncertain`.\n\
1184 7. TRACE QUESTIONS: For architecture or control-flow questions, prefer verified file and function names over high-level summaries.\n\
1185 8. If `trace_runtime_flow` fully answers the runtime question, preserve its identifiers exactly. Do not restyle or rename symbols from that tool output.\n\
1186 9. For generic capability questions, answer from stable Hematite capabilities. Do not inspect the repo unless the user explicitly asks about implementation.\n\
1187 10. Never infer language support, project support, or internet capability from unrelated crates or config files.\n\
1188 11. It is fine to say Hematite itself is written in Rust when relevant, but do not imply that capability is limited to Rust projects.\n\
1189 12. For language questions, answer at the harness level: file operations, shell, build verification, language-aware tooling when available, and multi-language project work.\n\
1190 13. Prefer real programming language examples like Python, JavaScript, TypeScript, Go, and C# over file extensions when answering language questions.\n\
1191 14. For project-building questions, talk about scaffolding, implementation, builds, tests, and iteration across different stacks instead of defaulting to a Rust-only example like `cargo build`.\n\
1192 15. Never mention raw `mcp__*` tool names unless those tools are active this turn and directly relevant.\n\
1193 16. For tooling-discipline or best-tool-selection questions, prefer `describe_toolchain` over improvising the tool surface from memory.\n\
1194 17. If `describe_toolchain` fully answers the tooling question, preserve its tool names and investigation order exactly.\n\
1195 18. PROOF BEFORE ACTION: Before editing an existing file, gather recent evidence with `read_file` or `inspect_lines` on that path or keep it pinned in active context.\n\
1196 18a. GREP BEFORE READ: For files over ~200 lines, always `grep_files` for a specific pattern to find the target line range BEFORE calling `read_file`. Never read a large file top-to-bottom — use offset+limit to read only the relevant window once grep gives you the line number.\n\
1197 19. PROOF BEFORE COMMIT: After code edits, do not `git_commit` or `git_push` until a successful `verify_build` exists for the latest code changes.\n\
1198 20. RISKY SHELL DISCIPLINE: Risky `shell` calls must include a concrete `reason` argument explaining what is being verified or changed.\n\
1199 21. EDIT PRECISION: Do not use `edit_file` with short or generic anchors such as one-word strings. Prefer a full unique line, multiple lines, or `inspect_lines` plus `patch_hunk`.\n\
1200 22. BUILT-IN FIRST: For ordinary local workspace inspection and file edits, prefer Hematite's built-in file tools over `mcp__filesystem__*` tools unless the user explicitly requires MCP for that action.\n\
1201 22a. HOST INSPECTION PRIORITY: For read-only questions about installed tools, PATH entries, desktop items, Downloads size, or directory summaries, prefer `inspect_host` over raw `shell` when it can answer directly.");
1202
1203 sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
1205 2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
1206 3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
1207 4. Fix all errors before declaring success.\n\n\
1208 ## PRE-FLIGHT SCOPING PROTOCOL\n\
1209 Before attempting any multi-file task or complex refactor:\n\
1210 1. Use `map_project` to understand the project structure.\n\
1211 2. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
1212 3. Use `auto_pin_context` to keep those files in active context.\n\
1213 4. Only then proceed to deeper edits or research.\n\n\
1214 ## REFACTORING PROTOCOL\n\
1215 When modifying existing code or renaming symbols:\n\
1216 1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
1217 2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
1218 3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
1219
1220 sys.push_str(&load_instruction_files());
1222
1223 sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
1225
1226 if !tools.is_empty() {
1228 sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
1229 for tool in tools {
1230 let schema = serde_json::to_string(&tool.function.parameters)
1231 .unwrap_or_else(|_| "{}".to_string());
1232 sys.push_str(&format!(
1233 "<|tool>declaration:{}{}{}<tool|>\n",
1234 tool.function.name, "{", schema
1235 ));
1236 sys.push_str(&format!("// {})\n", tool.function.description));
1237 }
1238 }
1239
1240 sys
1241 }
1242
1243 fn build_system_prompt_compact(
1244 &self,
1245 brief: bool,
1246 professional: bool,
1247 tools: &[ToolDefinition],
1248 ) -> String {
1249 let current_model = self.current_model();
1252 let current_context_length = self.current_context_length();
1253 let os = std::env::consts::OS;
1254
1255 let mut sys = String::from("<|turn>system\n<|think|>\n");
1256 sys.push_str(&format!(
1257 "You are Hematite {}, a local coding harness working on the user's machine.\n",
1258 crate::hematite_version_display()
1259 ));
1260 if professional {
1261 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1262 } else {
1263 sys.push_str(&format!(
1264 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1265 self.species
1266 ));
1267 }
1268 sys.push_str(&format!(
1269 "Model: {} | Context: {} tokens. Keep turns focused.\n",
1270 current_model, current_context_length
1271 ));
1272 if is_gemma4_model_name(¤t_model) {
1273 sys.push_str(
1274 "Gemma 4: use exact tool JSON. No extra prose in tool calls. \
1275 Raw regex patterns in grep_files, no slash delimiters.\n",
1276 );
1277 }
1278 if cfg!(target_os = "windows") {
1279 sys.push_str(&format!(
1280 "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1281 os
1282 ));
1283 } else {
1284 sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1285 }
1286 if brief {
1287 sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1288 }
1289
1290 sys.push_str(
1291 "\nCORE RULES:\n\
1292 - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1293 - Verify after edits: run `verify_build` after code changes, before committing.\n\
1294 - One tool at a time. Do not batch unrelated tool calls.\n\
1295 - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1296 - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1297 - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1298 );
1299
1300 if !tools.is_empty() {
1301 sys.push_str("\n# AVAILABLE TOOLS\n");
1302 for tool in tools {
1303 let desc: String = tool.function.description.chars().take(120).collect();
1304 sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1305 }
1306 }
1307
1308 sys.push_str("<turn|>\n");
1309 sys
1310 }
1311
1312 fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1313 let current_model = self.current_model();
1314 let current_context_length = self.current_context_length();
1315 let os = std::env::consts::OS;
1316 let mut sys = format!(
1317 "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1318 crate::hematite_version_display()
1319 );
1320 if professional {
1321 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1322 } else {
1323 sys.push_str(&format!(
1324 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1325 self.species
1326 ));
1327 }
1328 if !current_model.is_empty() {
1329 sys.push_str(&format!(
1330 "Loaded model: {} | Context window: {} tokens.\n",
1331 current_model, current_context_length
1332 ));
1333 } else {
1334 sys.push_str(&format!(
1335 "Context window: {} tokens.\n",
1336 current_context_length
1337 ));
1338 }
1339 sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1340 sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1341 sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1342 if cfg!(target_os = "windows") {
1343 sys.push_str(&format!(
1344 "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1345 os
1346 ));
1347 } else {
1348 sys.push_str(&format!(
1349 "You are running on {}. Use the native Unix shell conventions.\n",
1350 os
1351 ));
1352 }
1353 if brief {
1354 sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1355 }
1356 if is_gemma4_model_name(¤t_model) {
1357 sys.push_str(
1358 "Gemma 4 note: use exact tool JSON with no extra prose when calling tools.\n",
1359 );
1360 }
1361 sys.push_str("<turn|>\n");
1362 sys
1363 }
1364
1365 pub async fn call_with_tools(
1370 &self,
1371 messages: &[ChatMessage],
1372 tools: &[ToolDefinition],
1373 model_override: Option<&str>,
1375 ) -> Result<
1376 (
1377 Option<String>,
1378 Option<Vec<ToolCallResponse>>,
1379 Option<TokenUsage>,
1380 Option<String>,
1381 ),
1382 String,
1383 > {
1384 let _permit = self
1385 .kv_semaphore
1386 .acquire()
1387 .await
1388 .map_err(|e| e.to_string())?;
1389
1390 let current_model = self.current_model();
1391 let model = model_override.unwrap_or(current_model.as_str()).to_string();
1392 let filtered_tools = if cfg!(target_os = "windows") {
1393 tools
1394 .iter()
1395 .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1396 .cloned()
1397 .collect::<Vec<_>>()
1398 } else {
1399 tools.to_vec()
1400 };
1401
1402 let request_messages = if should_use_gemma_native_formatting(self, &model) {
1403 prepare_gemma_native_messages(messages)
1404 } else {
1405 messages.to_vec()
1406 };
1407
1408 const COMPACT_CORE_TOOLS: &[&str] = &[
1413 "read_file",
1414 "inspect_lines",
1415 "edit_file",
1416 "write_file",
1417 "grep_files",
1418 "list_files",
1419 "verify_build",
1420 "shell",
1421 "map_project",
1422 ];
1423 let effective_tools = if is_compact_context_window(self.current_context_length()) {
1424 let core: Vec<_> = filtered_tools
1425 .iter()
1426 .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1427 .cloned()
1428 .collect();
1429 if core.is_empty() {
1430 None
1431 } else {
1432 Some(core)
1433 }
1434 } else if filtered_tools.is_empty() {
1435 None
1436 } else {
1437 Some(filtered_tools)
1438 };
1439
1440 let request = ChatRequest {
1441 model: model.clone(),
1442 messages: request_messages,
1443 temperature: 0.2,
1444 stream: false,
1445 tools: effective_tools,
1446 };
1447
1448 preflight_chat_request(
1450 &model,
1451 &request.messages,
1452 request.tools.as_deref().unwrap_or(&[]),
1453 self.current_context_length(),
1454 )?;
1455
1456 let mut last_err = String::new();
1457 let mut response_opt: Option<reqwest::Response> = None;
1458 for attempt in 0..3u32 {
1459 match self.client.post(&self.api_url).json(&request).send().await {
1460 Ok(res) if res.status().is_success() => {
1461 response_opt = Some(res);
1462 break;
1463 }
1464 Ok(res) if res.status().as_u16() >= 500 => {
1465 last_err = format!("LM Studio error {}", res.status());
1466 }
1467 Ok(res) => {
1468 let status = res.status();
1470 let body = res.text().await.unwrap_or_default();
1471 let preview = &body[..body.len().min(300)];
1472 return Err(format!("LM Studio error {}: {}", status, preview));
1473 }
1474 Err(e) if e.is_timeout() || e.is_connect() => {
1475 last_err = format!("Request failed: {}", e);
1476 }
1477 Err(e) => return Err(format!("Request failed: {}", e)),
1478 }
1479 if attempt < 2 {
1480 let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1481 tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1482 }
1483 }
1484 let res = response_opt
1485 .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1486
1487 let body: ChatResponse = res
1488 .json()
1489 .await
1490 .map_err(|e| format!("Response parse error: {}", e))?;
1491
1492 if let Some(usage) = &body.usage {
1493 let mut econ = self.economics.lock().unwrap();
1494 econ.input_tokens += usage.prompt_tokens;
1495 econ.output_tokens += usage.completion_tokens;
1496 }
1497
1498 let choice = body
1499 .choices
1500 .into_iter()
1501 .next()
1502 .ok_or_else(|| "Empty response from model".to_string())?;
1503
1504 let finish_reason = choice.finish_reason;
1505 let mut tool_calls = choice.message.tool_calls;
1506 let mut content = choice.message.content;
1507
1508 if let Some(raw_content) = &content {
1511 let native_calls = extract_native_tool_calls(raw_content);
1512 if !native_calls.is_empty() {
1513 let mut existing = tool_calls.unwrap_or_default();
1514 existing.extend(native_calls);
1515 tool_calls = Some(existing);
1516 let stripped = strip_native_tool_call_text(raw_content);
1517 content = if stripped.trim().is_empty() {
1518 None
1519 } else {
1520 Some(stripped)
1521 };
1522 }
1523 }
1524
1525 if is_gemma4_model_name(&model) {
1526 if let Some(calls) = tool_calls.as_mut() {
1527 for call in calls.iter_mut() {
1528 call.function.arguments = normalize_tool_argument_string(
1529 &call.function.name,
1530 &call.function.arguments,
1531 );
1532 }
1533 }
1534 }
1535
1536 Ok((content, tool_calls, body.usage, finish_reason))
1537 }
1538
1539 pub async fn stream_messages(
1543 &self,
1544 messages: &[ChatMessage],
1545 tx: mpsc::Sender<InferenceEvent>,
1546 ) -> Result<(), Box<dyn std::error::Error>> {
1547 let current_model = self.current_model();
1548 let request_messages = if should_use_gemma_native_formatting(self, ¤t_model) {
1549 prepare_gemma_native_messages(messages)
1550 } else {
1551 messages
1552 .iter()
1553 .map(|m| {
1554 let mut clone = m.clone();
1555 let current_text = m.content.as_str();
1556 if !current_text.starts_with("<|turn>") {
1557 clone.content = MessageContent::Text(format!(
1558 "<|turn>{}\n{}\n<turn|>",
1559 m.role, current_text
1560 ));
1561 }
1562 clone
1563 })
1564 .collect()
1565 };
1566
1567 let request = ChatRequest {
1568 model: current_model.clone(),
1569 messages: request_messages,
1570 temperature: 0.7,
1571 stream: true,
1572 tools: None,
1573 };
1574
1575 if let Err(e) = preflight_chat_request(
1576 ¤t_model,
1577 &request.messages,
1578 &[],
1579 self.current_context_length(),
1580 ) {
1581 let tag = classify_runtime_failure_tag(&e);
1582 let _ = tx
1583 .send(InferenceEvent::ProviderStatus {
1584 state: provider_state_for_failure_tag(tag),
1585 summary: compact_runtime_failure_summary(tag, &e),
1586 })
1587 .await;
1588 let _ = tx
1589 .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1590 .await;
1591 let _ = tx.send(InferenceEvent::Done).await;
1592 return Ok(());
1593 }
1594
1595 let mut last_err = String::new();
1596 let mut response_opt: Option<reqwest::Response> = None;
1597 for attempt in 0..2u32 {
1598 match self.client.post(&self.api_url).json(&request).send().await {
1599 Ok(res) if res.status().is_success() => {
1600 response_opt = Some(res);
1601 break;
1602 }
1603 Ok(res) if res.status().as_u16() >= 500 => {
1604 last_err = format!("LM Studio error {}", res.status());
1605 }
1606 Ok(res) => {
1607 let status = res.status();
1608 let body = res.text().await.unwrap_or_default();
1609 let preview = &body[..body.len().min(300)];
1610 let detail = format!("LM Studio error {}: {}", status, preview);
1611 let tag = classify_runtime_failure_tag(&detail);
1612 let _ = tx
1613 .send(InferenceEvent::ProviderStatus {
1614 state: provider_state_for_failure_tag(tag),
1615 summary: compact_runtime_failure_summary(tag, &detail),
1616 })
1617 .await;
1618 let _ = tx
1619 .send(InferenceEvent::Error(format_runtime_failure_message(
1620 &detail,
1621 )))
1622 .await;
1623 let _ = tx.send(InferenceEvent::Done).await;
1624 return Ok(());
1625 }
1626 Err(e) if e.is_timeout() || e.is_connect() => {
1627 last_err = format!("Request failed: {}", e);
1628 }
1629 Err(e) => {
1630 let detail = format!("Request failed: {}", e);
1631 let tag = classify_runtime_failure_tag(&detail);
1632 let _ = tx
1633 .send(InferenceEvent::ProviderStatus {
1634 state: provider_state_for_failure_tag(tag),
1635 summary: compact_runtime_failure_summary(tag, &detail),
1636 })
1637 .await;
1638 let _ = tx
1639 .send(InferenceEvent::Error(format_runtime_failure_message(
1640 &detail,
1641 )))
1642 .await;
1643 let _ = tx.send(InferenceEvent::Done).await;
1644 return Ok(());
1645 }
1646 }
1647 if attempt < 1 {
1648 let _ = tx
1649 .send(InferenceEvent::ProviderStatus {
1650 state: ProviderRuntimeState::Recovering,
1651 summary: "LM Studio degraded during stream startup; retrying once.".into(),
1652 })
1653 .await;
1654 tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1655 }
1656 }
1657 let Some(res) = response_opt else {
1658 let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1659 let tag = classify_runtime_failure_tag(&detail);
1660 let _ = tx
1661 .send(InferenceEvent::ProviderStatus {
1662 state: provider_state_for_failure_tag(tag),
1663 summary: compact_runtime_failure_summary(tag, &detail),
1664 })
1665 .await;
1666 let _ = tx
1667 .send(InferenceEvent::Error(format_runtime_failure_message(
1668 &detail,
1669 )))
1670 .await;
1671 let _ = tx.send(InferenceEvent::Done).await;
1672 return Ok(());
1673 };
1674
1675 use futures::StreamExt;
1676 let mut byte_stream = res.bytes_stream();
1677
1678 let mut line_buffer = String::new();
1681 let mut content_buffer = String::new();
1682 let mut past_think = false;
1683 let mut emitted_any_content = false;
1684 let mut emitted_live_status = false;
1685
1686 while let Some(item) = byte_stream.next().await {
1687 if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1689 break;
1690 }
1691
1692 let chunk = match item {
1693 Ok(chunk) => chunk,
1694 Err(e) => {
1695 let detail = format!("Request failed: {}", e);
1696 let tag = classify_runtime_failure_tag(&detail);
1697 let _ = tx
1698 .send(InferenceEvent::ProviderStatus {
1699 state: provider_state_for_failure_tag(tag),
1700 summary: compact_runtime_failure_summary(tag, &detail),
1701 })
1702 .await;
1703 let _ = tx
1704 .send(InferenceEvent::Error(format_runtime_failure_message(
1705 &detail,
1706 )))
1707 .await;
1708 let _ = tx.send(InferenceEvent::Done).await;
1709 return Ok(());
1710 }
1711 };
1712 line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1713
1714 while let Some(pos) = line_buffer.find("\n\n") {
1715 let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1716 let data_pos = match event_str.find("data: ") {
1717 Some(p) => p,
1718 None => continue,
1719 };
1720
1721 let data = event_str[data_pos + 6..].trim();
1722 if data == "[DONE]" {
1723 break;
1724 }
1725
1726 if let Ok(json) = serde_json::from_str::<Value>(data) {
1727 if let Some(content) = json["choices"][0]["delta"]["content"].as_str() {
1728 if content.is_empty() {
1729 continue;
1730 }
1731
1732 if !past_think {
1733 let lc = content.to_lowercase();
1734 let close = lc
1735 .find("<channel|>")
1736 .map(|i| (i, "<channel|>".len()))
1737 .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1738
1739 if let Some((tag_start, tag_len)) = close {
1740 let before = &content[..tag_start];
1742 content_buffer.push_str(before);
1743 if !content_buffer.trim().is_empty() {
1744 let _ = tx
1745 .send(InferenceEvent::Thought(content_buffer.clone()))
1746 .await;
1747 emitted_any_content = true;
1748 }
1749 content_buffer.clear();
1750
1751 past_think = true;
1752 let after = content[tag_start + tag_len..].trim_start_matches('\n');
1753 content_buffer.push_str(after);
1754 } else {
1755 content_buffer.push_str(content);
1757 if content_buffer.len() > 30
1759 && (content.contains('\n') || content.contains('.'))
1760 {
1761 let _ = tx
1762 .send(InferenceEvent::Thought(content_buffer.clone()))
1763 .await;
1764 emitted_any_content = true;
1765 content_buffer.clear();
1766 }
1767 }
1768 } else {
1769 content_buffer.push_str(content);
1772 let is_boundary = content.contains(' ')
1773 || content.contains('.')
1774 || content.contains('!')
1775 || content.contains('?');
1776
1777 if content_buffer.len() > 10 && is_boundary {
1778 if !emitted_live_status {
1779 let _ = tx
1780 .send(InferenceEvent::ProviderStatus {
1781 state: ProviderRuntimeState::Live,
1782 summary: String::new(),
1783 })
1784 .await;
1785 emitted_live_status = true;
1786 }
1787 let _ =
1788 tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1789 emitted_any_content = true;
1790 content_buffer.clear();
1791 }
1792 }
1793 }
1794 }
1795 }
1796 }
1797
1798 if !content_buffer.is_empty() {
1800 if past_think {
1801 if !emitted_live_status {
1802 let _ = tx
1803 .send(InferenceEvent::ProviderStatus {
1804 state: ProviderRuntimeState::Live,
1805 summary: String::new(),
1806 })
1807 .await;
1808 }
1809 let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1810 } else {
1811 let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1812 }
1813 emitted_any_content = true;
1814 }
1815
1816 if !emitted_any_content {
1817 let _ = tx
1818 .send(InferenceEvent::ProviderStatus {
1819 state: ProviderRuntimeState::EmptyResponse,
1820 summary: compact_runtime_failure_summary(
1821 "empty_model_response",
1822 "Empty response from model",
1823 ),
1824 })
1825 .await;
1826 let _ = tx
1827 .send(InferenceEvent::Error(format_runtime_failure_message(
1828 "Empty response from model",
1829 )))
1830 .await;
1831 let _ = tx.send(InferenceEvent::Done).await;
1832 return Ok(());
1833 }
1834
1835 let _ = tx.send(InferenceEvent::Done).await;
1836 Ok(())
1837 }
1838
1839 pub async fn stream_generation(
1841 &self,
1842 prompt: &str,
1843 snark: u8,
1844 chaos: u8,
1845 brief: bool,
1846 professional: bool,
1847 tx: mpsc::Sender<InferenceEvent>,
1848 ) -> Result<(), Box<dyn std::error::Error>> {
1849 let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1850 let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1851 self.stream_messages(&messages, tx).await
1852 }
1853
1854 pub async fn generate_task_worker(
1858 &self,
1859 prompt: &str,
1860 professional: bool,
1861 ) -> Result<String, String> {
1862 let current_model = self.current_model();
1863 let model = self
1864 .worker_model
1865 .as_deref()
1866 .unwrap_or(current_model.as_str());
1867 self.generate_task_with_model(prompt, 0.1, professional, model)
1868 .await
1869 }
1870
1871 pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1872 self.generate_task_with_temp(prompt, 0.1, professional)
1873 .await
1874 }
1875
1876 pub async fn generate_task_with_temp(
1877 &self,
1878 prompt: &str,
1879 temp: f32,
1880 professional: bool,
1881 ) -> Result<String, String> {
1882 let current_model = self.current_model();
1883 self.generate_task_with_model(prompt, temp, professional, ¤t_model)
1884 .await
1885 }
1886
1887 pub async fn generate_task_with_model(
1888 &self,
1889 prompt: &str,
1890 temp: f32,
1891 professional: bool,
1892 model: &str,
1893 ) -> Result<String, String> {
1894 let _permit = self
1895 .kv_semaphore
1896 .acquire()
1897 .await
1898 .map_err(|e| e.to_string())?;
1899
1900 let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1901 let request_messages = if should_use_gemma_native_formatting(self, model) {
1902 prepare_gemma_native_messages(&[
1903 ChatMessage::system(&system),
1904 ChatMessage::user(prompt),
1905 ])
1906 } else {
1907 vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1908 };
1909 let request = ChatRequest {
1910 model: model.to_string(),
1911 messages: request_messages,
1912 temperature: temp,
1913 stream: false,
1914 tools: None,
1915 };
1916
1917 preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1918
1919 let res = self
1920 .client
1921 .post(&self.api_url)
1922 .json(&request)
1923 .send()
1924 .await
1925 .map_err(|e| format!("LM Studio request failed: {}", e))?;
1926
1927 let body: ChatResponse = res
1928 .json()
1929 .await
1930 .map_err(|e| format!("Failed to parse response: {}", e))?;
1931
1932 body.choices
1933 .first()
1934 .and_then(|c| c.message.content.clone())
1935 .ok_or_else(|| "Empty response from model".to_string())
1936 }
1937
1938 #[allow(dead_code)]
1942 pub fn snip_history(
1943 &self,
1944 turns: &[ChatMessage],
1945 max_tokens_estimate: usize,
1946 keep_recent: usize,
1947 ) -> Vec<ChatMessage> {
1948 let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
1949 if total_chars / 4 <= max_tokens_estimate {
1950 return turns.to_vec();
1951 }
1952 let keep = keep_recent.min(turns.len());
1953 let mut snipped = vec![turns[0].clone()];
1954 if turns.len() > keep + 1 {
1955 snipped.push(ChatMessage::system(&format!(
1956 "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
1957 turns.len() - keep - 1
1958 )));
1959 snipped.extend_from_slice(&turns[turns.len() - keep..]);
1960 } else {
1961 snipped = turns.to_vec();
1962 }
1963 snipped
1964 }
1965}
1966
1967fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
1968 serde_json::to_vec(value)
1969 .ok()
1970 .map_or(0, |bytes| bytes.len() / 4 + 1)
1971}
1972
1973const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
1974
1975fn estimate_message_tokens(message: &ChatMessage) -> usize {
1976 let content_tokens = match &message.content {
1977 MessageContent::Text(s) => s.len() / 4 + 1,
1978 MessageContent::Parts(parts) => parts
1979 .iter()
1980 .map(|part| match part {
1981 ContentPart::Text { text } => text.len() / 4 + 1,
1982 ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
1985 })
1986 .sum(),
1987 };
1988 let tool_tokens: usize = message
1989 .tool_calls
1990 .iter()
1991 .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
1992 .sum();
1993 content_tokens + tool_tokens + 6
1994}
1995
1996pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
1997 messages.iter().map(estimate_message_tokens).sum()
1998}
1999
2000fn reserved_output_tokens(context_length: usize) -> usize {
2001 let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2002 proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2003}
2004
2005pub fn estimate_prompt_pressure(
2006 messages: &[ChatMessage],
2007 tools: &[ToolDefinition],
2008 context_length: usize,
2009) -> (usize, usize, usize, u8) {
2010 let estimated_input_tokens =
2011 estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2012 let reserved_output = reserved_output_tokens(context_length);
2013 let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2014 let percent = if context_length == 0 {
2015 0
2016 } else {
2017 ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2018 };
2019 (
2020 estimated_input_tokens,
2021 reserved_output,
2022 estimated_total,
2023 percent,
2024 )
2025}
2026
2027fn preflight_chat_request(
2028 model: &str,
2029 messages: &[ChatMessage],
2030 tools: &[ToolDefinition],
2031 context_length: usize,
2032) -> Result<(), String> {
2033 let (estimated_input_tokens, reserved_output, estimated_total, _) =
2034 estimate_prompt_pressure(messages, tools, context_length);
2035
2036 if estimated_total > context_length {
2037 return Err(format!(
2038 "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2039 model, estimated_input_tokens, reserved_output, estimated_total, context_length
2040 ));
2041 }
2042
2043 Ok(())
2044}
2045
/// Collects project instruction files into one prompt section.
///
/// Starting at the current working directory and walking up through at most
/// four directory levels, reads `CLAUDE.md`, `CLAUDE.local.md` and
/// `.hematite/instructions.md` where present. Empty files and files whose
/// content was already seen are skipped. Each file is cut to
/// `MAX_PER_FILE` bytes (on a character boundary) and the combined size is
/// limited by `MAX_TOTAL`.
///
/// Returns an empty string when the working directory is unavailable or no
/// file contributed; otherwise a `# Project Instructions` section.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;
    const CANDIDATES: [&str; 3] = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    let Ok(cwd) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;

    let mut dir = cwd;
    for _ in 0..4 {
        for name in &CANDIDATES {
            let path = dir.join(name);
            // Missing or unreadable files are skipped silently.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            // De-duplicate by content hash so the same text reachable under
            // two names or levels is only included once.
            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back up to a char boundary: slicing at a fixed byte offset
                // panics when it lands inside a multi-byte character.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            // Over budget: stop scanning the remaining candidates in this
            // directory (parent directories are still visited).
            if total_chars + truncated.len() > MAX_TOTAL {
                break;
            }
            total_chars += truncated.len();
            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
        }
        let Some(parent) = dir.parent().map(|p| p.to_path_buf()) else {
            break;
        };
        dir = parent;
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2109
/// Extracts the text between `<|channel>thought` and `<channel|>`.
///
/// Tags are matched ASCII case-insensitively. When the closing tag is
/// missing, everything after the opening tag is used. Returns `None` when
/// there is no opening tag or the enclosed text is empty after trimming.
pub fn extract_think_block(text: &str) -> Option<String> {
    // ASCII lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid for slicing `text`. Full Unicode lowercasing can
    // change lengths and shift every offset after such a character.
    let lower = text.to_ascii_lowercase();

    let open_tag = "<|channel>thought";
    let close_tag = "<channel|>";

    let start_pos = lower.find(open_tag)?;
    let content_start = start_pos + open_tag.len();

    let close_pos = lower[content_start..]
        .find(close_tag)
        .map(|p| content_start + p)
        .unwrap_or(text.len());

    let content = text[content_start..close_pos].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
2132
2133pub fn strip_think_blocks(text: &str) -> String {
2134 let text = {
2138 let t = text.trim_start();
2139 if t.to_lowercase().starts_with("</think>") {
2140 &t[8..]
2141 } else {
2142 text
2143 }
2144 };
2145
2146 let lower = text.to_lowercase();
2147
2148 if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
2150 let answer = text[end..]
2151 .replace("<|channel>thought", "")
2152 .replace("<channel|>", "");
2153 return answer.trim().replace("\n\n\n", "\n\n").to_string();
2154 }
2155
2156 let first_open = [
2158 lower.find("<|channel>thought"), lower.find("<think>"),
2160 lower.find("<thought>"),
2161 lower.find("<|think|>"),
2162 ]
2163 .iter()
2164 .filter_map(|&x| x)
2165 .min();
2166
2167 if let Some(start) = first_open {
2168 if start > 0 {
2169 return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
2170 }
2171 return String::new();
2172 }
2173
2174 let naked_reasoning_phrases: &[&str] = &[
2178 "the user asked",
2179 "the user is asking",
2180 "the user wants",
2181 "i will structure",
2182 "i should provide",
2183 "i should give",
2184 "i should avoid",
2185 "i should note",
2186 "i should focus",
2187 "i should keep",
2188 "i should respond",
2189 "i should present",
2190 "i should display",
2191 "i should show",
2192 "i need to",
2193 "i can see from",
2194 "without being overly",
2195 "let me ",
2196 "necessary information in my identity",
2197 "was computed successfully",
2198 "computed successfully",
2199 ];
2200 let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
2201 if is_naked_reasoning {
2202 let lines: Vec<&str> = text.lines().collect();
2203 if !lines.is_empty() {
2204 let mut start_idx = 0;
2207 for (i, line) in lines.iter().enumerate() {
2208 let l = line.to_lowercase();
2209 let is_reasoning_line =
2210 naked_reasoning_phrases.iter().any(|p| l.contains(p)) || l.trim().is_empty();
2211 if is_reasoning_line {
2212 start_idx = i + 1;
2213 } else {
2214 break;
2215 }
2216 }
2217 if start_idx < lines.len() {
2218 return lines[start_idx..]
2219 .join("\n")
2220 .trim()
2221 .replace("\n\n\n", "\n\n")
2222 .to_string();
2223 }
2224 return String::new();
2226 }
2227 }
2228
2229 let cleaned = strip_xml_tool_call_artifacts(text);
2232 cleaned.trim().replace("\n\n\n", "\n\n").to_string()
2233}
2234
/// Removes stray XML-style tool-call tags and closing think tags from `text`.
///
/// Tags are processed in the order listed in `XML_ARTIFACTS`; for each tag, every
/// ASCII case-insensitive occurrence is deleted before moving on to the next tag.
/// Only the tags themselves are removed; the text between them is kept.
///
/// An ASCII-lowercased shadow copy is searched and edited in lockstep with the
/// output. ASCII case folding preserves byte lengths, so a match position in the
/// shadow is always the same, valid position in the output (full Unicode
/// lowercasing does not give that guarantee), and the buffer is lowercased only once.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    // All entries are lowercase ASCII, so they can be searched for directly.
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        "</think>",
        "</thought>",
        "</thinking>",
    ];

    let mut out = text.to_string();
    let mut lower = text.to_ascii_lowercase();
    for tag in XML_ARTIFACTS {
        while let Some(pos) = lower.find(*tag) {
            let span = pos..pos + tag.len();
            lower.replace_range(span.clone(), "");
            out.replace_range(span, "");
        }
    }
    out
}
2267
2268pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2271 use regex::Regex;
2272 let mut results = Vec::new();
2273
2274 let re_call = Regex::new(
2280 r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2281 ).unwrap();
2282 let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2285
2286 for cap in re_call.captures_iter(text) {
2287 let name = cap[1].to_string();
2288 let args_str = &cap[2];
2289 let mut arguments = serde_json::Map::new();
2290
2291 for arg_cap in re_arg.captures_iter(args_str) {
2292 let key = arg_cap[1].to_string();
2293 let val_raw = arg_cap
2295 .get(2)
2296 .map(|m| m.as_str())
2297 .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2298 .unwrap_or("")
2299 .trim();
2300 let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2301
2302 let val = if normalized_raw == "true" {
2304 Value::Bool(true)
2305 } else if normalized_raw == "false" {
2306 Value::Bool(false)
2307 } else if let Ok(n) = normalized_raw.parse::<i64>() {
2308 Value::Number(n.into())
2309 } else if let Ok(n) = normalized_raw.parse::<u64>() {
2310 Value::Number(n.into())
2311 } else if let Ok(n) = normalized_raw.parse::<f64>() {
2312 serde_json::Number::from_f64(n)
2313 .map(Value::Number)
2314 .unwrap_or(Value::String(normalized_raw.clone()))
2315 } else {
2316 Value::String(normalized_raw)
2317 };
2318
2319 arguments.insert(key, val);
2320 }
2321
2322 results.push(ToolCallResponse {
2323 id: format!("call_{}", rand::random::<u32>()),
2324 call_type: "function".to_string(),
2325 function: ToolCallFn {
2326 name,
2327 arguments: Value::Object(arguments).to_string(),
2328 },
2329 });
2330 }
2331
2332 results
2333}
2334
2335pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2336 let trimmed = raw.trim();
2337 let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2338
2339 let mut value = match serde_json::from_str::<Value>(&candidate) {
2340 Ok(v) => v,
2341 Err(_) => return candidate,
2342 };
2343 normalize_tool_argument_value(tool_name, &mut value);
2344 value.to_string()
2345}
2346
2347fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2348 match value {
2349 Value::String(s) => *s = normalize_string_arg(s),
2350 Value::Array(items) => {
2351 for item in items {
2352 normalize_tool_argument_value(tool_name, item);
2353 }
2354 }
2355 Value::Object(map) => {
2356 for val in map.values_mut() {
2357 normalize_tool_argument_value(tool_name, val);
2358 }
2359 if tool_name == "grep_files" {
2360 if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2361 *pattern = normalize_regex_pattern(pattern);
2362 }
2363 }
2364 for key in ["path", "extension", "query", "command", "reason"] {
2365 if let Some(Value::String(s)) = map.get_mut(key) {
2366 *s = normalize_string_arg(s);
2367 }
2368 }
2369 }
2370 _ => {}
2371 }
2372}
2373
/// Removes exactly one layer of matching quotes (`"`, `'` or a backtick) from `input`.
///
/// Returns `None` when `input` is not wrapped in a matching pair. Otherwise the
/// inner text has `\"` replaced by `"`, then `\\` replaced by `\`, and is trimmed.
/// No other escape sequences are interpreted.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    // A lone quote character fails here: after stripping the prefix nothing is
    // left for the suffix to match.
    let inner = ['"', '\'', '`']
        .iter()
        .find_map(|&quote| input.strip_prefix(quote)?.strip_suffix(quote))?;

    let unescaped = inner.replace("\\\"", "\"").replace("\\\\", "\\");
    Some(unescaped.trim().to_string())
}
2387
/// Trims `input` and peels off every layer of matching surrounding quotes.
///
/// A layer is a pair of identical `"`, `'` or backtick characters at both ends.
/// After each layer is removed the remainder is trimmed again, and the process
/// repeats until the text is no longer wrapped. Mismatched or lone quotes are kept.
fn normalize_string_arg(input: &str) -> String {
    let mut current = input.trim();
    loop {
        let peeled = ['"', '\'', '`']
            .iter()
            .find_map(|&quote| current.strip_prefix(quote)?.strip_suffix(quote));
        match peeled {
            Some(inner) => current = inner.trim(),
            None => break,
        }
    }
    current.to_string()
}
2405
2406fn normalize_regex_pattern(input: &str) -> String {
2407 let out = normalize_string_arg(input);
2408 if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2409 out[1..out.len() - 1].to_string()
2410 } else {
2411 out
2412 }
2413}
2414
2415fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2416 let mut system_blocks = Vec::new();
2417 let mut prepared = Vec::new();
2418 let mut seeded = false;
2419
2420 for message in messages {
2421 if message.role == "system" {
2422 let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2423 .trim()
2424 .to_string();
2425 if !cleaned.is_empty() {
2426 system_blocks.push(cleaned);
2427 }
2428 continue;
2429 }
2430
2431 let mut clone = message.clone();
2432 clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2433
2434 if !seeded && message.role == "user" {
2435 let mut merged = String::new();
2436 if !system_blocks.is_empty() {
2437 merged.push_str("System instructions for this turn:\n");
2438 merged.push_str(&system_blocks.join("\n\n"));
2439 merged.push_str("\n\n");
2440 }
2441 merged.push_str(clone.content.as_str());
2442 clone.content = MessageContent::Text(merged);
2443 seeded = true;
2444 }
2445
2446 prepared.push(clone);
2447 }
2448
2449 if !seeded && !system_blocks.is_empty() {
2450 prepared.insert(
2451 0,
2452 ChatMessage::user(&format!(
2453 "System instructions for this turn:\n{}",
2454 system_blocks.join("\n\n")
2455 )),
2456 );
2457 }
2458
2459 prepared
2460}
2461
/// Removes legacy turn-wrapper markers from `text` and trims the result.
///
/// The opening markers `<|turn>system\n`, `<|turn>user\n`, `<|turn>assistant\n`
/// and `<|turn>tool\n` are removed first (in that order), followed by the closing
/// marker `<turn|>`. Matching is exact and case-sensitive.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const WRAPPERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let mut cleaned = text.to_string();
    for marker in WRAPPERS.iter() {
        cleaned = cleaned.replace(*marker, "");
    }
    cleaned.trim().to_string()
}
2471
2472pub fn strip_native_tool_call_text(text: &str) -> String {
2473 use regex::Regex;
2474 let re_call = Regex::new(
2475 r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2476 ).unwrap();
2477 let re_response =
2478 Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2479 .unwrap();
2480 let without_calls = re_call.replace_all(text, "");
2481 re_response
2482 .replace_all(without_calls.as_ref(), "")
2483 .trim()
2484 .to_string()
2485}
2486
// Unit tests for system-prompt construction and native tool-call parsing/stripping.
#[cfg(test)]
mod tests {
    use super::*;

    // The generated system prompt must contain the crate's version string.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            "http://localhost:1234/v1".to_string(),
            "strategist".to_string(),
            0,
        )
        .expect("engine");

        let system = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);
        assert!(system.contains(crate::HEMATITE_VERSION));
    }

    // A call opened with `<|tool_call>` and closed with `<tool_call|>` is parsed into
    // one tool call with typed arguments, and is removed by the stripping helper.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        // Raw string: the `\"` sequences below are a literal backslash plus quote.
        let text = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "read_file");

        // Numeric arguments come back as JSON numbers; the path loses its quotes.
        let args: Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args.get("limit").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(args.get("offset").and_then(|v| v.as_i64()), Some(100));
        assert_eq!(
            args.get("path").and_then(|v| v.as_str()),
            Some("src/ui/tui.rs")
        );

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<tool_call|>"));
    }

    // A transcript in which each call is followed by a `<|tool_response>` section:
    // both calls are still extracted, and no call or response markers survive stripping.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let text = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:map_project{focus:<|\"|>src/<|\"|>,include_symbols:true}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "map_project");
        assert_eq!(calls[1].function.name, "read_file");

        let stripped = strip_native_tool_call_text(text);
        assert!(!stripped.contains("<|tool_call"));
        assert!(!stripped.contains("<|tool_response"));
        assert!(!stripped.contains("<tool_response|>"));
    }
}