1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use tokio::sync::{mpsc, Semaphore};
4
5pub use crate::agent::economics::{SessionEconomics, ToolRecord};
6
/// Shared state for talking to the model provider's HTTP API.
///
/// Built by `InferenceEngine::new`. The runtime-adjustable fields use a lock or
/// atomics so they can be updated through `&self`.
pub struct InferenceEngine {
    /// HTTP client used for every provider request.
    pub client: reqwest::Client,
    /// Chat endpoint URL; `new` ensures it ends in `/chat/completions`.
    pub api_url: String,
    /// `scheme://authority` part of the API URL, used to build the model-listing endpoints.
    pub base_url: String,
    /// Species label interpolated into the non-professional system prompt.
    pub species: String,
    /// Snark level supplied at construction.
    pub snark: u8,
    /// Semaphore created with 3 permits in `new`.
    /// NOTE(review): presumably bounds concurrent inference calls -- confirm against callers.
    pub kv_semaphore: Semaphore,
    /// Identifier of the current model; empty until a runtime profile is set.
    pub model: std::sync::RwLock<String>,
    /// Current context window in tokens; initialised to 32_768 in `new`.
    pub context_length: std::sync::atomic::AtomicUsize,
    /// Shared session economics record; `new` starts it from `SessionEconomics::new()`.
    pub economics: std::sync::Arc<std::sync::Mutex<SessionEconomics>>,
    /// Optional secondary model id; `new` leaves it `None`.
    /// NOTE(review): its consumers are outside this section.
    pub worker_model: Option<String>,
    /// Flag toggled by `set_gemma_native_formatting`; starts `false`.
    pub gemma_native_formatting: std::sync::Arc<std::sync::atomic::AtomicBool>,
    /// Shared boolean flag; starts `false`.
    /// NOTE(review): presumably raised to request cancellation -- confirm against callers.
    pub cancel_token: std::sync::Arc<std::sync::atomic::AtomicBool>,
}
30
/// Returns `true` when `model` names a Gemma 4 model.
///
/// The comparison is ASCII case-insensitive and accepts both the `gemma-4`
/// and `gemma4` spellings anywhere inside the name.
pub fn is_gemma4_model_name(model: &str) -> bool {
    const MARKERS: [&str; 2] = ["gemma-4", "gemma4"];
    let normalized = model.to_ascii_lowercase();
    MARKERS.iter().any(|marker| normalized.contains(*marker))
}
35
36fn should_use_gemma_native_formatting(engine: &InferenceEngine, model: &str) -> bool {
37 is_gemma4_model_name(model) && engine.gemma_native_formatting_enabled()
38}
39
/// A tool advertised to the model.
///
/// Serializes as an object with a `type` key and a `function` key; the local
/// `metadata` is never written out.
#[derive(Serialize, Clone, Debug)]
pub struct ToolDefinition {
    /// Serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub tool_type: String,
    /// Name, description and parameters of the callable.
    pub function: ToolFunction,
    /// Local classification of the tool; excluded from serde.
    #[serde(skip_serializing, skip_deserializing)]
    pub metadata: ToolMetadata,
}
50
/// The callable part of a [`ToolDefinition`].
#[derive(Serialize, Clone, Debug)]
pub struct ToolFunction {
    /// Tool name as exposed to the model.
    pub name: String,
    /// Human-readable description of the tool.
    pub description: String,
    /// Arbitrary JSON value describing the parameters.
    /// NOTE(review): presumably a JSON Schema object -- confirm against the tool registry.
    pub parameters: Value,
}
57
/// Coarse grouping assigned to a tool by `tool_metadata_for_name`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ToolCategory {
    /// File reading, listing and searching tools.
    RepoRead,
    /// File writing, editing and patching tools.
    RepoWrite,
    /// `shell` and `inspect_host`.
    Runtime,
    /// `trace_runtime_flow`.
    Architecture,
    /// `describe_toolchain`.
    Toolchain,
    /// `verify_build`.
    Verification,
    /// The `git_*` tools.
    Git,
    /// `research_web` and `fetch_docs`.
    Research,
    /// `vision_analyze`.
    Vision,
    /// The `lsp_*` tools.
    Lsp,
    /// Workflow runners plus pinning, clarification and task management tools.
    Workflow,
    /// Any tool whose name starts with `mcp__`.
    External,
    /// Fallback for names that match nothing else.
    Other,
}
74
/// Classification flags attached to a tool by `tool_metadata_for_name`.
///
/// NOTE(review): the code that consumes these flags is outside this section;
/// the field notes below describe only how the flags are assigned.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ToolMetadata {
    /// Coarse grouping of the tool.
    pub category: ToolCategory,
    /// Set for write, shell, workflow-runner and git tools, and for MCP tools
    /// whose name contains a mutating verb.
    pub mutates_workspace: bool,
    /// Set only for `mcp__*` tools.
    pub external_surface: bool,
    /// Set for every mutating built-in tool and for all `mcp__*` tools.
    pub trust_sensitive: bool,
    /// Set for every tool that is not flagged as mutating.
    pub read_only_friendly: bool,
    /// Set for the repo read/write tools and for `auto_pin_context`,
    /// `list_pinned` and `clarify`.
    pub plan_scope: bool,
}
84
85pub fn tool_metadata_for_name(name: &str) -> ToolMetadata {
86 if name.starts_with("mcp__") {
87 let lower = name.to_ascii_lowercase();
88 let mutates_workspace = [
89 "__edit",
90 "__write",
91 "__create",
92 "__move",
93 "__delete",
94 "__remove",
95 "__rename",
96 "__replace",
97 "__patch",
98 ]
99 .iter()
100 .any(|needle| lower.contains(needle));
101 return ToolMetadata {
102 category: ToolCategory::External,
103 mutates_workspace,
104 external_surface: true,
105 trust_sensitive: true,
106 read_only_friendly: !mutates_workspace,
107 plan_scope: false,
108 };
109 }
110
111 match name {
112 "read_file" | "inspect_lines" | "grep_files" | "list_files" => ToolMetadata {
113 category: ToolCategory::RepoRead,
114 mutates_workspace: false,
115 external_surface: false,
116 trust_sensitive: false,
117 read_only_friendly: true,
118 plan_scope: true,
119 },
120 "write_file" | "edit_file" | "patch_hunk" | "multi_search_replace" => ToolMetadata {
121 category: ToolCategory::RepoWrite,
122 mutates_workspace: true,
123 external_surface: false,
124 trust_sensitive: true,
125 read_only_friendly: false,
126 plan_scope: true,
127 },
128 "trace_runtime_flow" => ToolMetadata {
129 category: ToolCategory::Architecture,
130 mutates_workspace: false,
131 external_surface: false,
132 trust_sensitive: false,
133 read_only_friendly: true,
134 plan_scope: false,
135 },
136 "describe_toolchain" => ToolMetadata {
137 category: ToolCategory::Toolchain,
138 mutates_workspace: false,
139 external_surface: false,
140 trust_sensitive: false,
141 read_only_friendly: true,
142 plan_scope: false,
143 },
144 "shell" => ToolMetadata {
145 category: ToolCategory::Runtime,
146 mutates_workspace: true,
147 external_surface: false,
148 trust_sensitive: true,
149 read_only_friendly: false,
150 plan_scope: false,
151 },
152 "inspect_host" => ToolMetadata {
153 category: ToolCategory::Runtime,
154 mutates_workspace: false,
155 external_surface: false,
156 trust_sensitive: false,
157 read_only_friendly: true,
158 plan_scope: false,
159 },
160 "run_hematite_maintainer_workflow" => ToolMetadata {
161 category: ToolCategory::Workflow,
162 mutates_workspace: true,
163 external_surface: false,
164 trust_sensitive: true,
165 read_only_friendly: false,
166 plan_scope: false,
167 },
168 "run_workspace_workflow" => ToolMetadata {
169 category: ToolCategory::Workflow,
170 mutates_workspace: true,
171 external_surface: false,
172 trust_sensitive: true,
173 read_only_friendly: false,
174 plan_scope: false,
175 },
176 "verify_build" => ToolMetadata {
177 category: ToolCategory::Verification,
178 mutates_workspace: false,
179 external_surface: false,
180 trust_sensitive: false,
181 read_only_friendly: true,
182 plan_scope: false,
183 },
184 "git_commit" | "git_push" | "git_remote" | "git_onboarding" | "git_worktree" => {
185 ToolMetadata {
186 category: ToolCategory::Git,
187 mutates_workspace: true,
188 external_surface: false,
189 trust_sensitive: true,
190 read_only_friendly: false,
191 plan_scope: false,
192 }
193 }
194 "research_web" | "fetch_docs" => ToolMetadata {
195 category: ToolCategory::Research,
196 mutates_workspace: false,
197 external_surface: false,
198 trust_sensitive: false,
199 read_only_friendly: true,
200 plan_scope: false,
201 },
202 "vision_analyze" => ToolMetadata {
203 category: ToolCategory::Vision,
204 mutates_workspace: false,
205 external_surface: false,
206 trust_sensitive: false,
207 read_only_friendly: true,
208 plan_scope: false,
209 },
210 "lsp_definitions"
211 | "lsp_references"
212 | "lsp_hover"
213 | "lsp_rename_symbol"
214 | "lsp_get_diagnostics"
215 | "lsp_search_symbol" => ToolMetadata {
216 category: ToolCategory::Lsp,
217 mutates_workspace: false,
218 external_surface: false,
219 trust_sensitive: false,
220 read_only_friendly: true,
221 plan_scope: false,
222 },
223 "auto_pin_context" | "list_pinned" | "clarify" => ToolMetadata {
224 category: ToolCategory::Workflow,
225 mutates_workspace: false,
226 external_surface: false,
227 trust_sensitive: false,
228 read_only_friendly: true,
229 plan_scope: true,
230 },
231 "manage_tasks" => ToolMetadata {
232 category: ToolCategory::Workflow,
233 mutates_workspace: false,
234 external_surface: false,
235 trust_sensitive: false,
236 read_only_friendly: true,
237 plan_scope: false,
238 },
239 _ => ToolMetadata {
240 category: ToolCategory::Other,
241 mutates_workspace: false,
242 external_surface: false,
243 trust_sensitive: false,
244 read_only_friendly: true,
245 plan_scope: false,
246 },
247 }
248}
249
/// One message in a chat-completions conversation.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ChatMessage {
    /// Speaker role; the constructors in this file use "system", "user",
    /// "assistant" and "tool".
    pub role: String,
    /// Message body: plain text or a list of typed parts.
    pub content: MessageContent,
    /// Tool calls attached to the message. Omitted from JSON when empty and
    /// defaulted to empty when the key is absent.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tool_calls: Vec<ToolCallResponse>,
    /// Id of the tool call this message answers; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
    /// Function name on tool-result messages; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
}
269
/// Body of a [`ChatMessage`].
///
/// Untagged for serde: it (de)serializes either as a bare string or as an
/// array of parts, with no wrapper key.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(untagged)]
pub enum MessageContent {
    /// Plain text body.
    Text(String),
    /// Body made of typed parts (text and/or image references).
    Parts(Vec<ContentPart>),
}
276
/// One element of a multi-part message body.
///
/// Internally tagged: the JSON `type` key selects the variant.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(tag = "type")]
pub enum ContentPart {
    /// `"type": "text"` -- a text fragment.
    #[serde(rename = "text")]
    Text { text: String },
    /// `"type": "image_url"` -- a reference to an image.
    #[serde(rename = "image_url")]
    ImageUrl { image_url: ImageUrlSource },
}
285
/// Wrapper object holding the URL of an image content part.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ImageUrlSource {
    /// Image location as passed by the caller.
    /// NOTE(review): whether data URLs are expected is not visible here.
    pub url: String,
}
290
291impl Default for MessageContent {
292 fn default() -> Self {
293 MessageContent::Text(String::new())
294 }
295}
296
297impl MessageContent {
298 pub fn as_str(&self) -> &str {
299 match self {
300 MessageContent::Text(s) => s,
301 MessageContent::Parts(parts) => {
302 for part in parts {
303 if let ContentPart::Text { text } = part {
304 return text;
305 }
306 }
307 ""
308 }
309 }
310 }
311}
312
313impl ChatMessage {
314 pub fn system(content: &str) -> Self {
315 Self {
316 role: "system".into(),
317 content: MessageContent::Text(content.into()),
318 tool_calls: Vec::new(),
319 tool_call_id: None,
320 name: None,
321 }
322 }
323 pub fn user(content: &str) -> Self {
324 Self {
325 role: "user".into(),
326 content: MessageContent::Text(content.into()),
327 tool_calls: Vec::new(),
328 tool_call_id: None,
329 name: None,
330 }
331 }
332 pub fn user_with_image(text: &str, image_url: &str) -> Self {
333 let mut text_parts = text.to_string();
334 if !text_parts.contains("<|image|>") {
335 text_parts.push_str(" <|image|>");
336 }
337 Self {
338 role: "user".into(),
339 content: MessageContent::Parts(vec![
340 ContentPart::Text { text: text_parts },
341 ContentPart::ImageUrl {
342 image_url: ImageUrlSource {
343 url: image_url.into(),
344 },
345 },
346 ]),
347 tool_calls: Vec::new(),
348 tool_call_id: None,
349 name: None,
350 }
351 }
352 pub fn assistant_text(content: &str) -> Self {
353 Self {
354 role: "assistant".into(),
355 content: MessageContent::Text(content.into()),
356 tool_calls: Vec::new(),
357 tool_call_id: None,
358 name: None,
359 }
360 }
361 pub fn assistant_tool_calls(content: &str, calls: Vec<ToolCallResponse>) -> Self {
362 Self {
363 role: "assistant".into(),
364 content: MessageContent::Text(content.into()),
365 tool_calls: calls,
366 tool_call_id: None,
367 name: None,
368 }
369 }
370 pub fn tool_result(tool_call_id: &str, fn_name: &str, content: &str) -> Self {
371 Self::tool_result_for_model(tool_call_id, fn_name, content, "")
372 }
373
374 pub fn tool_result_for_model(
377 tool_call_id: &str,
378 fn_name: &str,
379 content: &str,
380 model: &str,
381 ) -> Self {
382 let body = if is_gemma4_model_name(model) {
383 format!(
384 "<|tool_response>response:{}{}{}<tool_response|>",
385 fn_name, "{", content
386 )
387 } else {
388 content.to_string()
389 };
390 Self {
391 role: "tool".into(),
392 content: MessageContent::Text(body),
393 tool_calls: Vec::new(),
394 tool_call_id: Some(tool_call_id.into()),
395 name: Some(fn_name.into()),
396 }
397 }
398}
399
/// A tool call as it appears in the `tool_calls` array of a message.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallResponse {
    /// Identifier of the call; tool-result messages carry it as `tool_call_id`.
    pub id: String,
    /// (De)serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub call_type: String,
    /// The function being invoked and its arguments.
    pub function: ToolCallFn,
}
409
/// Function name and arguments of a [`ToolCallResponse`].
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct ToolCallFn {
    /// Name of the tool being called.
    pub name: String,
    /// Arguments carried as a string.
    /// NOTE(review): presumably JSON-encoded text -- confirm against the parser.
    pub arguments: String,
}
416
/// Request body sent to the chat-completions endpoint.
#[derive(Serialize)]
struct ChatRequest {
    /// Model identifier to run.
    model: String,
    /// Conversation so far.
    messages: Vec<ChatMessage>,
    /// Sampling temperature.
    temperature: f32,
    /// Value of the request's `stream` flag.
    stream: bool,
    /// Tool definitions to advertise; the key is omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    tools: Option<Vec<ToolDefinition>>,
}
428
/// Response body of a chat-completions call.
#[derive(Deserialize, Debug)]
struct ChatResponse {
    /// Completion choices returned by the provider.
    choices: Vec<ResponseChoice>,
    /// Token accounting, when the provider reports it.
    usage: Option<TokenUsage>,
}
434
/// Token accounting reported by the provider for one request.
#[derive(Deserialize, Debug, Clone)]
pub struct TokenUsage {
    /// Tokens in the prompt.
    pub prompt_tokens: usize,
    /// Tokens in the completion.
    pub completion_tokens: usize,
    /// Total tokens as reported by the provider.
    pub total_tokens: usize,
    /// Defaults to 0 when the key is absent.
    /// NOTE(review): presumably a provider-specific cache-hit counter -- confirm.
    #[serde(default)]
    pub prompt_cache_hit_tokens: usize,
    /// Defaults to 0 when the key is absent.
    /// NOTE(review): presumably another provider's spelling of cached prompt tokens -- confirm.
    #[serde(default)]
    pub cache_read_input_tokens: usize,
}
445
/// One entry of the `choices` array in a [`ChatResponse`].
#[derive(Deserialize, Debug)]
struct ResponseChoice {
    /// The message for this choice.
    message: ResponseMessage,
    /// Finish reason reported by the provider; `None` when the key is absent.
    #[serde(default)]
    finish_reason: Option<String>,
}
452
/// Message payload of a [`ResponseChoice`].
#[derive(Deserialize, Debug)]
struct ResponseMessage {
    /// Text content, if any.
    content: Option<String>,
    /// Tool calls requested by the model, if any.
    tool_calls: Option<Vec<ToolCallResponse>>,
    /// Separate reasoning text; `None` when the key is absent.
    /// NOTE(review): which providers populate this is not visible here.
    #[serde(default)]
    reasoning_content: Option<String>,
}
463
/// Lower bound, in tokens, for the reserved output budget.
/// NOTE(review): the code that applies this bound is outside this section.
const MIN_RESERVED_OUTPUT_TOKENS: usize = 1024;
/// Upper bound, in tokens, for the reserved output budget.
/// NOTE(review): the code that applies this bound is outside this section.
const MAX_RESERVED_OUTPUT_TOKENS: usize = 4096;
466
/// A context window of at most 8_192 tokens counts as "tiny".
fn is_tiny_context_window(context_length: usize) -> bool {
    matches!(context_length, 0..=8_192)
}
470
/// A context window above 8_192 and up to 49_152 tokens (inclusive) counts as
/// "compact".
fn is_compact_context_window(context_length: usize) -> bool {
    (8_193..=49_152).contains(&context_length)
}
474
/// Public entry point for the private `is_compact_context_window` check.
///
/// Returns `true` when `context_length` is above 8_192 and at most 49_152.
pub fn is_compact_context_window_pub(context_length: usize) -> bool {
    is_compact_context_window(context_length)
}
478
/// Recognises provider error text that reports a context-length overflow.
///
/// `lower` must already be lower-cased. It matches when it mentions both
/// `n_keep` and `n_ctx`, or contains any one of the known phrases.
fn is_provider_context_limit_detail(lower: &str) -> bool {
    const PHRASES: [&str; 4] = [
        "context length",
        "keep from the initial prompt",
        "prompt is greater than the context length",
        "exceeds the context window",
    ];

    let names_both_counters = ["n_keep", "n_ctx"]
        .iter()
        .all(|token| lower.contains(*token));

    names_both_counters || PHRASES.iter().any(|phrase| lower.contains(*phrase))
}
486
487fn classify_runtime_failure_tag(detail: &str) -> &'static str {
488 let lower = detail.to_ascii_lowercase();
489 if lower.contains("context_window_blocked")
490 || lower.contains("context ceiling reached")
491 || lower.contains("exceeds the")
492 || is_provider_context_limit_detail(&lower)
493 {
494 "context_window"
495 } else if lower.contains("empty response from model")
496 || lower.contains("model returned an empty response")
497 {
498 "empty_model_response"
499 } else if lower.contains("action blocked:")
500 || lower.contains("access denied")
501 || lower.contains("declined by user")
502 {
503 "tool_policy_blocked"
504 } else {
505 "provider_degraded"
506 }
507}
508
/// Returns the operator-facing advice for a failure tag.
///
/// Unknown tags, including `provider_degraded`, get the generic retry advice.
fn runtime_failure_guidance(tag: &str) -> &'static str {
    const FALLBACK: &str =
        "Retry once automatically, then narrow the turn or restart LM Studio if it persists.";
    const GUIDANCE: [(&str, &str); 3] = [
        (
            "context_window",
            "Narrow the request, compact the session, or preserve grounded tool output instead of restyling it. If LM Studio reports a smaller live n_ctx than Hematite expected, reload or re-detect the model budget before retrying.",
        ),
        (
            "empty_model_response",
            "Retry once automatically, then narrow the turn or restart LM Studio if the model keeps returning nothing.",
        ),
        (
            "tool_policy_blocked",
            "Stay inside the allowed workflow or switch modes before retrying.",
        ),
    ];

    GUIDANCE
        .iter()
        .find(|(known_tag, _)| *known_tag == tag)
        .map(|(_, advice)| *advice)
        .unwrap_or(FALLBACK)
}
523
524fn format_runtime_failure_message(detail: &str) -> String {
525 let tag = classify_runtime_failure_tag(detail);
526 format!(
527 "[failure:{}] {} Detail: {}",
528 tag,
529 runtime_failure_guidance(tag),
530 detail.trim()
531 )
532}
533
/// Health state of the model provider as reported in
/// `InferenceEvent::ProviderStatus`.
///
/// NOTE(review): only the failure-tag mapping (`provider_state_for_failure_tag`)
/// is visible here; the other variants are set by code outside this section.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProviderRuntimeState {
    Booting,
    Live,
    Recovering,
    /// Chosen for any failure tag other than the two below.
    Degraded,
    /// Chosen for the `context_window` failure tag.
    ContextWindow,
    /// Chosen for the `empty_model_response` failure tag.
    EmptyResponse,
}
543
/// State carried by `InferenceEvent::McpStatus`.
///
/// NOTE(review): the code that selects these states is outside this section.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum McpRuntimeState {
    Unconfigured,
    Healthy,
    Degraded,
    Failed,
}
551
/// State carried by `InferenceEvent::OperatorCheckpoint`.
///
/// Each variant has a stable snake_case label available through `label()`.
/// NOTE(review): the code that raises these checkpoints is outside this section.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OperatorCheckpointState {
    Idle,
    RecoveringProvider,
    BudgetReduced,
    HistoryCompacted,
    BlockedContextWindow,
    BlockedPolicy,
    BlockedRecentFileEvidence,
    BlockedExactLineWindow,
    BlockedToolLoop,
    BlockedVerification,
}
565
566impl OperatorCheckpointState {
567 pub fn label(self) -> &'static str {
568 match self {
569 OperatorCheckpointState::Idle => "idle",
570 OperatorCheckpointState::RecoveringProvider => "recovering_provider",
571 OperatorCheckpointState::BudgetReduced => "budget_reduced",
572 OperatorCheckpointState::HistoryCompacted => "history_compacted",
573 OperatorCheckpointState::BlockedContextWindow => "blocked_context_window",
574 OperatorCheckpointState::BlockedPolicy => "blocked_policy",
575 OperatorCheckpointState::BlockedRecentFileEvidence => "blocked_recent_file_evidence",
576 OperatorCheckpointState::BlockedExactLineWindow => "blocked_exact_line_window",
577 OperatorCheckpointState::BlockedToolLoop => "blocked_tool_loop",
578 OperatorCheckpointState::BlockedVerification => "blocked_verification",
579 }
580 }
581}
582
583fn provider_state_for_failure_tag(tag: &str) -> ProviderRuntimeState {
584 match tag {
585 "context_window" => ProviderRuntimeState::ContextWindow,
586 "empty_model_response" => ProviderRuntimeState::EmptyResponse,
587 _ => ProviderRuntimeState::Degraded,
588 }
589}
590
/// Produces a one-line summary of a runtime failure.
///
/// The three known tags (`context_window`, `empty_model_response`,
/// `tool_policy_blocked`) map to fixed sentences. Any other tag yields
/// `LM Studio degraded: <excerpt>`, where the excerpt is the first 12
/// whitespace-separated words of `detail`, cut to at most 110 bytes and
/// suffixed with `...` when it was cut. An empty excerpt yields a generic
/// sentence instead.
///
/// The cut never splits a multi-byte character: `String::truncate` panics
/// when the new length is not on a char boundary, so the cut point is moved
/// back to the nearest boundary at or below 110.
fn compact_runtime_failure_summary(tag: &str, detail: &str) -> String {
    const MAX_EXCERPT_BYTES: usize = 110;
    const MAX_EXCERPT_WORDS: usize = 12;

    match tag {
        "context_window" => {
            "LM Studio context ceiling hit; narrow the turn or refresh the live runtime budget."
                .to_string()
        }
        "empty_model_response" => {
            "LM Studio returned an empty reply; Hematite will retry once before surfacing a failure."
                .to_string()
        }
        "tool_policy_blocked" => {
            "A blocked tool path was rejected; stay inside the allowed workflow before retrying."
                .to_string()
        }
        _ => {
            let mut excerpt = detail
                .split_whitespace()
                .take(MAX_EXCERPT_WORDS)
                .collect::<Vec<_>>()
                .join(" ");
            if excerpt.len() > MAX_EXCERPT_BYTES {
                // Index 0 is always a boundary, so this loop terminates.
                let mut cut = MAX_EXCERPT_BYTES;
                while !excerpt.is_char_boundary(cut) {
                    cut -= 1;
                }
                excerpt.truncate(cut);
                excerpt.push_str("...");
            }
            if excerpt.is_empty() {
                "LM Studio degraded; Hematite will retry once before surfacing a failure."
                    .to_string()
            } else {
                format!("LM Studio degraded: {}", excerpt)
            }
        }
    }
}
624
/// Events emitted by the inference pipeline to its consumer.
///
/// NOTE(review): the producers and consumers are outside this section. The
/// per-variant notes below are derived from the variant names, field names and
/// field types only and should be confirmed against the event loop.
#[derive(Debug)]
pub enum InferenceEvent {
    /// A piece of model output text.
    Token(String),
    /// Text carried separately from `Token`.
    /// NOTE(review): presumably output that is not shown or voiced -- confirm.
    MutedToken(String),
    /// Text from the model's reasoning channel.
    Thought(String),
    /// Status text for the voice subsystem.
    VoiceStatus(String),
    /// A tool call is starting; carries its id, tool name and argument string.
    ToolCallStart {
        id: String,
        name: String,
        args: String,
    },
    /// A tool call finished; `is_error` marks a failed call.
    ToolCallResult {
        id: String,
        name: String,
        output: String,
        is_error: bool,
    },
    /// A tool call needs a decision before it proceeds.
    ///
    /// `responder` is a one-shot sender carrying a `bool`.
    /// NOTE(review): presumably `true` means approved -- confirm against the receiver.
    ApprovalRequired {
        id: String,
        name: String,
        display: String,
        diff: Option<String>,
        responder: tokio::sync::oneshot::Sender<bool>,
    },
    /// The turn finished.
    Done,
    /// The turn failed with the given message.
    Error(String),
    /// Provider health changed; `summary` is operator-facing text.
    ProviderStatus {
        state: ProviderRuntimeState,
        summary: String,
    },
    /// An operator checkpoint was reached; `summary` is operator-facing text.
    OperatorCheckpoint {
        state: OperatorCheckpointState,
        summary: String,
    },
    /// Text describing a recovery recipe.
    RecoveryRecipe { summary: String },
    /// MCP health changed; `summary` is operator-facing text.
    McpStatus {
        state: McpRuntimeState,
        summary: String,
    },
    /// Estimated token count versus a compaction threshold, plus a percentage.
    CompactionPressure {
        estimated_tokens: usize,
        threshold_tokens: usize,
        percent: u8,
    },
    /// Estimated prompt size, reserved output budget and their total versus
    /// the context length, plus a percentage.
    PromptPressure {
        estimated_input_tokens: usize,
        reserved_output_tokens: usize,
        estimated_total_tokens: usize,
        context_length: usize,
        percent: u8,
    },
    /// Progress update for a task identified by `id`.
    /// NOTE(review): `progress` is a `u8`; presumably a percentage -- confirm.
    TaskProgress {
        id: String,
        label: String,
        progress: u8,
    },
    /// Token usage reported by the provider.
    UsageUpdate(TokenUsage),
    /// The active model id and context length.
    RuntimeProfile {
        model_id: String,
        context_length: usize,
    },
    /// Counters describing the "vein" index.
    /// NOTE(review): the indexing subsystem is not visible here.
    VeinStatus {
        file_count: usize,
        embedded_count: usize,
        docs_only: bool,
    },
    /// Paths associated with vein context for the turn.
    VeinContext { paths: Vec<String> },
    /// A new persona was rolled.
    SoulReroll {
        species: String,
        rarity: String,
        shiny: bool,
        personality: String,
    },
    /// The embedding model id, or `None`.
    EmbedProfile { model_id: Option<String> },
}
729
730impl InferenceEngine {
733 pub fn new(
734 api_url: String,
735 species: String,
736 snark: u8,
737 ) -> Result<Self, Box<dyn std::error::Error>> {
738 let client = reqwest::Client::builder()
739 .timeout(std::time::Duration::from_secs(180))
740 .build()?;
741
742 let base_url = {
744 let trimmed = api_url.trim_end_matches('/');
745 if let Some(scheme_end) = trimmed.find("://") {
746 let after_scheme = &trimmed[scheme_end + 3..];
747 if let Some(path_start) = after_scheme.find('/') {
748 format!(
749 "{}://{}",
750 &trimmed[..scheme_end],
751 &after_scheme[..path_start]
752 )
753 } else {
754 trimmed.to_string()
755 }
756 } else {
757 trimmed.to_string()
758 }
759 };
760
761 let api_url = if api_url.ends_with("/chat/completions") {
762 api_url
763 } else if api_url.ends_with("/") {
764 format!("{}chat/completions", api_url)
765 } else {
766 format!("{}/chat/completions", api_url)
767 };
768
769 Ok(Self {
770 client,
771 api_url,
772 base_url,
773 species,
774 snark,
775 kv_semaphore: Semaphore::new(3),
776 model: std::sync::RwLock::new(String::new()),
777 context_length: std::sync::atomic::AtomicUsize::new(32_768), economics: std::sync::Arc::new(std::sync::Mutex::new(SessionEconomics::new())),
779 worker_model: None,
780 gemma_native_formatting: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
781 cancel_token: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)),
782 })
783 }
784
785 pub fn set_gemma_native_formatting(&self, enabled: bool) {
786 self.gemma_native_formatting
787 .store(enabled, std::sync::atomic::Ordering::SeqCst);
788 }
789
790 pub fn gemma_native_formatting_enabled(&self) -> bool {
791 self.gemma_native_formatting
792 .load(std::sync::atomic::Ordering::SeqCst)
793 }
794
795 pub fn current_model(&self) -> String {
796 self.model.read().map(|g| g.clone()).unwrap_or_default()
797 }
798
799 pub fn current_context_length(&self) -> usize {
800 self.context_length
801 .load(std::sync::atomic::Ordering::SeqCst)
802 }
803
804 pub fn set_runtime_profile(&self, model: &str, context_length: usize) {
805 if let Ok(mut guard) = self.model.write() {
806 *guard = model.to_string();
807 }
808 self.context_length
809 .store(context_length, std::sync::atomic::Ordering::SeqCst);
810 }
811
812 pub async fn health_check(&self) -> bool {
814 let url = format!("{}/v1/models", self.base_url);
815 match self.client.get(&url).send().await {
816 Ok(resp) => resp.status().is_success(),
817 Err(_) => false,
818 }
819 }
820
821 pub async fn get_loaded_model(&self) -> Option<String> {
829 #[derive(Deserialize)]
830 struct ModelList {
831 data: Vec<ModelEntry>,
832 }
833 #[derive(Deserialize)]
834 struct ModelEntry {
835 id: String,
836 #[serde(rename = "type", default)]
837 model_type: String,
838 #[serde(default)]
839 state: String,
840 }
841
842 if let Ok(resp) = self
844 .client
845 .get(format!("{}/api/v0/models", self.base_url))
846 .send()
847 .await
848 {
849 if let Ok(list) = resp.json::<ModelList>().await {
850 let chat_model = list
851 .data
852 .into_iter()
853 .find(|m| m.model_type != "embeddings" && m.state == "loaded")
854 .map(|m| m.id)
855 .unwrap_or_default();
856 return Some(chat_model);
857 }
858 }
859
860 let resp = self
862 .client
863 .get(format!("{}/v1/models", self.base_url))
864 .send()
865 .await
866 .ok()?;
867 let list: ModelList = resp.json().await.ok()?;
868 Some(
869 list.data
870 .into_iter()
871 .find(|m| !m.id.to_lowercase().contains("embed"))
872 .map(|m| m.id)
873 .unwrap_or_default(),
874 )
875 }
876
877 pub async fn get_embedding_model(&self) -> Option<String> {
883 #[derive(Deserialize)]
884 struct ModelList {
885 data: Vec<ModelEntry>,
886 }
887 #[derive(Deserialize)]
888 struct ModelEntry {
889 id: String,
890 #[serde(rename = "type", default)]
891 model_type: String,
892 #[serde(default)]
893 state: String,
894 }
895 let resp = self
896 .client
897 .get(format!("{}/api/v0/models", self.base_url))
898 .send()
899 .await
900 .ok()?;
901 let list: ModelList = resp.json().await.ok()?;
902 list.data
903 .into_iter()
904 .find(|m| m.model_type == "embeddings" && m.state == "loaded")
905 .map(|m| m.id)
906 }
907
908 pub async fn detect_context_length(&self) -> usize {
914 #[derive(Deserialize)]
915 struct LmStudioModel {
916 id: Option<String>,
917 #[serde(rename = "type", default)]
918 model_type: String,
919 state: Option<String>,
920 loaded_context_length: Option<u64>,
921 context_length: Option<u64>,
922 max_context_length: Option<u64>,
923 }
924 #[derive(Deserialize)]
925 struct LmStudioList {
926 data: Vec<LmStudioModel>,
927 }
928
929 if let Ok(resp) = self
931 .client
932 .get(format!("{}/api/v0/models", self.base_url))
933 .send()
934 .await
935 {
936 if let Ok(list) = resp.json::<LmStudioList>().await {
937 let target_model = self.current_model().to_ascii_lowercase();
938 let non_embed = |m: &&LmStudioModel| m.model_type != "embeddings";
940 let loaded = list
941 .data
942 .iter()
943 .find(|m| {
944 non_embed(m)
945 && m.state.as_deref() == Some("loaded")
946 && m.id
947 .as_deref()
948 .map(|id| id.eq_ignore_ascii_case(&target_model))
949 .unwrap_or(false)
950 })
951 .or_else(|| {
952 list.data
953 .iter()
954 .find(|m| non_embed(m) && m.state.as_deref() == Some("loaded"))
955 })
956 .or_else(|| {
957 list.data.iter().find(|m| {
958 non_embed(m)
959 && m.id
960 .as_deref()
961 .map(|id| id.eq_ignore_ascii_case(&target_model))
962 .unwrap_or(false)
963 })
964 })
965 .or_else(|| list.data.iter().find(|m| non_embed(m)));
966
967 if let Some(model) = loaded {
968 if let Some(ctx) = model.loaded_context_length {
969 if ctx > 0 {
970 return ctx as usize;
971 }
972 }
973 if let Some(ctx) = model.context_length {
974 if ctx > 0 {
975 return ctx as usize;
976 }
977 }
978 if let Some(ctx) = model.max_context_length {
979 if ctx > 0 && ctx <= 32_768 {
980 return ctx as usize;
981 }
982 }
983 }
984 }
985 }
986
987 if self.current_model().to_lowercase().contains("gemma-4") {
991 return 32_768;
992 }
993
994 32_768
995 }
996
997 pub async fn refresh_runtime_profile(&self) -> Option<(String, usize, bool)> {
998 let previous_model = self.current_model();
999 let previous_context = self.current_context_length();
1000
1001 let detected_model = match self.get_loaded_model().await {
1002 Some(m) if !m.is_empty() => m, Some(_) => "no model loaded".to_string(), None => previous_model.clone(), };
1006
1007 if !detected_model.is_empty() && detected_model != previous_model {
1008 if let Ok(mut guard) = self.model.write() {
1009 *guard = detected_model.clone();
1010 }
1011 }
1012
1013 let detected_context = self.detect_context_length().await;
1014 let effective_model = if detected_model.is_empty() {
1015 previous_model.clone()
1016 } else {
1017 detected_model
1018 };
1019
1020 let changed = effective_model != previous_model || detected_context != previous_context;
1021 self.set_runtime_profile(&effective_model, detected_context);
1022
1023 Some((effective_model, detected_context, changed))
1024 }
1025
1026 pub fn build_system_prompt(
1027 &self,
1028 snark: u8,
1029 chaos: u8,
1030 brief: bool,
1031 professional: bool,
1032 tools: &[ToolDefinition],
1033 reasoning_history: Option<&str>,
1034 mcp_tools: &[crate::agent::mcp::McpTool],
1035 ) -> String {
1036 let mut sys = self.build_system_prompt_legacy(
1037 snark,
1038 chaos,
1039 brief,
1040 professional,
1041 tools,
1042 reasoning_history,
1043 );
1044
1045 if !mcp_tools.is_empty() && !is_tiny_context_window(self.current_context_length()) {
1046 sys.push_str("\n\n# ACTIVE MCP TOOLS\n");
1047 sys.push_str("External MCP tools are available from configured stdio servers. Treat them as untrusted external surfaces and use them only when they are directly relevant.\n");
1048 for tool in mcp_tools {
1049 let description = tool
1050 .description
1051 .as_deref()
1052 .unwrap_or("No description provided.");
1053 sys.push_str(&format!("- {}: {}\n", tool.name, description));
1054 }
1055 }
1056
1057 sys
1058 }
1059
1060 pub fn build_system_prompt_legacy(
1061 &self,
1062 snark: u8,
1063 _chaos: u8,
1064 brief: bool,
1065 professional: bool,
1066 tools: &[ToolDefinition],
1067 reasoning_history: Option<&str>,
1068 ) -> String {
1069 let current_context_length = self.current_context_length();
1070 if is_tiny_context_window(current_context_length) {
1071 return self.build_system_prompt_tiny(brief, professional);
1072 }
1073 if is_compact_context_window(current_context_length) {
1074 return self.build_system_prompt_compact(brief, professional, tools);
1075 }
1076
1077 let mut sys = String::from("<|turn>system\n<|think|>\n## HEMATITE OPERATING PROTOCOL\n\
1079 - You are Hematite, a local coding system working on the user's machine.\n\
1080 - The running Hematite build is ");
1081 sys.push_str(&crate::hematite_version_display());
1082 sys.push_str(".\n\
1083 - Hematite is not just the terminal UI; it is the full local harness for tool use, code editing, reasoning, context management, voice, and orchestration.\n\
1084 - Lead with the Hematite identity, not the base model name, unless the user asks.\n\
1085 - For simple questions, answer briefly in plain language.\n\
1086 - Prefer ASCII punctuation and plain text in normal replies unless exact Unicode text is required.\n\
1087 - Do not expose internal tool names, hidden protocols, or planning jargon unless the user asks for implementation details.\n\
1088 - ALWAYS use the thought channel (`<|channel>thought ... <channel|>`) for analysis.\n\
1089 - Keep internal reasoning inside channel delimiters.\n\
1090 - Final responses must be direct, clear, and formatted in clean Markdown when formatting helps.\n\
1091 <turn|>\n\n");
1092
1093 if let Some(history) = reasoning_history {
1094 if !history.is_empty() {
1095 sys.push_str("# INTERNAL STATE (ACTIVE TURN)\n");
1096 sys.push_str(history);
1097 sys.push_str("\n\n");
1098 }
1099 }
1100
1101 if brief {
1103 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: LOW\n\
1104 - Core directive: Think efficiently. Avoid redundant internal derivation.\n\
1105 - Depth: Surface-level verification only.\n\n");
1106 } else {
1107 sys.push_str("# ADAPTIVE THOUGHT EFFICIENCY: HIGH\n\
1108 - Core directive: Think in depth when the task needs it. Explore edge cases and architectural implications.\n\
1109 - Depth: Full multi-step derivation required.\n\n");
1110 }
1111
1112 let os = std::env::consts::OS;
1114 if professional {
1115 sys.push_str(&format!(
1116 "You are Hematite, a local coding system running on {}. \
1117 The TUI is one interface layer, not your whole identity. \
1118 Be direct, practical, technically precise, and ASCII-first in ordinary prose. \
1119 Skip filler and keep the focus on the work.\n",
1120 os
1121 ));
1122 } else {
1123 sys.push_str(&format!(
1124 "You are Hematite, a [{}] local AI coding system (Snark: {}/100) running on the user's hardware on {}. \
1125 The terminal UI is only one surface of the system. \
1126 Be direct, efficient, technical, and ASCII-first in ordinary prose. \
1127 When the user asks who you are, describe Hematite as the local coding harness and agent, not merely the TUI.\n",
1128 self.species, snark, os
1129 ));
1130 }
1131
1132 let current_model = self.current_model();
1134 if !current_model.is_empty() {
1135 sys.push_str(&format!(
1136 "Loaded model: {} | Context window: {} tokens. \
1137 Calibrate response length and tool-call depth to fit within this budget.\n\n",
1138 current_model, current_context_length
1139 ));
1140 if is_gemma4_model_name(¤t_model) {
1141 sys.push_str(
1142 "Gemma 4 native note: prefer exact tool JSON with no extra prose when calling tools. \
1143 Do not wrap `path`, `extension`, or other string arguments in extra quote layers. \
1144 For `grep_files`, provide the raw regex pattern without surrounding slash delimiters.\n\n",
1145 );
1146 }
1147 } else {
1148 sys.push_str(&format!(
1149 "Context window: {} tokens. Calibrate response length to fit within this budget.\n\n",
1150 current_context_length
1151 ));
1152 }
1153
1154 let shell_desc = if cfg!(target_os = "windows") {
1156 "[EXTERNAL SHELL]: `powershell` (Windows).\n\
1157 - Use ONLY for builds, tests, or file migrations. \n\
1158 - You MUST use the `powershell` tool directly. \n\
1159 - NEVER attempt to use `bash`, `sh`, or `/dev/null` on this system. \n\n"
1160 } else {
1161 "[EXTERNAL SHELL]: `bash` (Unix).\n\
1162 - Use ONLY for builds, tests, or file migrations. \n\
1163 - NEVER wrap bash in other shells. \n\n"
1164 };
1165
1166 sys.push_str("You distinguish strictly between [INTERNAL TOOLS] and [EXTERNAL SHELL].\n\n\
1167 [INTERNAL TOOLS]: `list_files`, `grep_files`, `read_file`, `edit_file`, `write_file`.\n\
1168 - These are the ONLY way to explore and modify code. \n\
1169 - NEVER attempt to run these as shell commands (e.g. `bash $ grep_files` is FORBIDDEN).\n\n");
1170 sys.push_str(shell_desc);
1171
1172 sys.push_str("ANTI-LOOPING: If a tool returns (no output) or 'not recognized' in a shell, pivot to a different internal tool. \n\
1174 SELF-AUDIT: If you see your own command echoed back as the result, the shell failed; pivot to an internal tool immediately.\n\n");
1175
1176 if brief {
1177 sys.push_str(
1178 "BRIEF MODE: Respond in exactly ONE concise sentence unless providing code.\n\n",
1179 );
1180 }
1181
1182 if cfg!(target_os = "windows") {
1183 sys.push_str("Shell Protocol: You are running on WINDOWS. You MUST NOT use 'bash' or '/dev/null'. \
1184 You MUST use 'powershell' (pwsh) for all shell tasks. \
1185 DO NOT attempt to manipulate Linux-style paths like /dev, /etc, or /sys.\n\n");
1186 } else if cfg!(target_os = "macos") {
1187 sys.push_str(
1188 "Shell Protocol: You are running on macOS. Use 'bash' or 'zsh' for shell tasks. \
1189 Standard Unix paths apply.\n\n",
1190 );
1191 } else {
1192 sys.push_str(
1193 "Shell Protocol: You are running on Linux. Use 'bash' for shell tasks. \
1194 Standard Unix paths apply.\n\n",
1195 );
1196 }
1197
1198 sys.push_str("OUTPUT RULES:\n\
1199 1. Your internal reasoning goes in <think>...</think> blocks. Do NOT output reasoning as plain text.\n\
1200 2. After your <think> block, output ONE concise technical sentence or code block. Nothing else.\n\
1201 3. Do NOT call tools named 'thought', 'think', 'reasoning', or any meta-cognitive name. These are not tools.\n\
1202 4. NEGATIVE CONSTRAINT: Never use a string containing a dot (.), slash (/), or backslash (\\) as a tool name. Paths are NOT tools.\n\
1203 5. NEGATIVE CONSTRAINT: Never use the name of a class, struct, or module as a tool name unless it is explicitly in the tool list.\n\
1204 6. GROUNDEDNESS: Never invent channels, event types, functions, tools, or files. If a detail is not verified from the repo or tool output, say `uncertain`.\n\
1205 7. TRACE QUESTIONS: For architecture or control-flow questions, prefer verified file and function names over high-level summaries.\n\
1206 8. If `trace_runtime_flow` fully answers the runtime question, preserve its identifiers exactly. Do not restyle or rename symbols from that tool output.\n\
1207 9. For generic capability questions, answer from stable Hematite capabilities. Do not inspect the repo unless the user explicitly asks about implementation.\n\
1208 10. Never infer language support, project support, or internet capability from unrelated crates or config files.\n\
1209 11. It is fine to say Hematite itself is written in Rust when relevant, but do not imply that capability is limited to Rust projects.\n\
1210 12. For language questions, answer at the harness level: file operations, shell, build verification, language-aware tooling when available, and multi-language project work.\n\
1211 13. Prefer real programming language examples like Python, JavaScript, TypeScript, Go, and C# over file extensions when answering language questions.\n\
1212 14. For project-building questions, talk about scaffolding, implementation, builds, tests, and iteration across different stacks instead of defaulting to a Rust-only example like `cargo build`.\n\
1213 15. Never mention raw `mcp__*` tool names unless those tools are active this turn and directly relevant.\n\
1214 16. For tooling-discipline or best-tool-selection questions, prefer `describe_toolchain` over improvising the tool surface from memory.\n\
1215 17. If `describe_toolchain` fully answers the tooling question, preserve its tool names and investigation order exactly.\n\
1216 18. PROOF BEFORE ACTION: Before editing an existing file, gather recent evidence with `read_file` or `inspect_lines` on that path or keep it pinned in active context.\n\
1217 18a. GREP BEFORE READ: For files over ~200 lines, always `grep_files` for a specific pattern to find the target line range BEFORE calling `read_file`. Never read a large file top-to-bottom — use offset+limit to read only the relevant window once grep gives you the line number.\n\
1218 19. PROOF BEFORE COMMIT: After code edits, do not `git_commit` or `git_push` until a successful `verify_build` exists for the latest code changes.\n\
1219 20. RISKY SHELL DISCIPLINE: Risky `shell` calls must include a concrete `reason` argument explaining what is being verified or changed.\n\
1220 21. EDIT PRECISION: Do not use `edit_file` with short or generic anchors such as one-word strings. Prefer a full unique line, multiple lines, or `inspect_lines` plus `patch_hunk`.\n\
1221 22. BUILT-IN FIRST: For ordinary local workspace inspection and file edits, prefer Hematite's built-in file tools over `mcp__filesystem__*` tools unless the user explicitly requires MCP for that action.\n\
1222 22a. HOST INSPECTION PRIORITY: For read-only questions about installed tools, PATH entries, environment/package-manager health, grounded fix plans for common workstation failures, network state, service state, running processes, desktop items, Downloads size, listening ports, repo-health summaries, or directory/disk reports, prefer `inspect_host` over raw `shell` when it can answer directly. If the user asks how to fix a common workstation problem such as `cargo not found`, `port 3000 already in use`, or `LM Studio not reachable`, use `fix_plan` first instead of `env_doctor`, `path`, or `ports`. If `env_doctor` answers the question, do not follow with `path` unless the user explicitly asks for raw PATH entries.\n\
1223 22b. HEMATITE MAINTAINER WORKFLOW PRIORITY: When the user explicitly asks to run Hematite's own cleanup, packaging, or release scripts, prefer `run_hematite_maintainer_workflow` over raw `shell`. This tool is for Hematite's own maintainer workflows, not for arbitrary scripts in the active workspace.\n\
1224 22c. WORKSPACE WORKFLOW PRIORITY: When the user asks to run the current project's build, test, lint, fix, package scripts, just/task/make targets, local scripts, or an exact workspace command, prefer `run_workspace_workflow` over raw `shell`. This tool always runs from the locked workspace root. If no real project workspace is locked, say so and tell the user to relaunch Hematite in the target project directory.");
1225
1226 sys.push_str("\n## SCAFFOLDING PROTOCOL\n\
1228 2. ALWAYS call verify_build immediately after to confirm the project compiles/runs.\n\
1229 3. If verify_build fails, use `lsp_get_diagnostics` to find the exact line and error.\n\
1230 4. Fix all errors before declaring success.\n\n\
1231 ## PRE-FLIGHT SCOPING PROTOCOL\n\
1232 Before attempting any multi-file task or complex refactor:\n\
1233 1. Identify 1-3 core files (entry-points, central models, or types) that drive the logic.\n\
1234 2. Use `auto_pin_context` to keep those files in active context.\n\
1235 3. Only then proceed to deeper edits or research.\n\n\
1236 ## REFACTORING PROTOCOL\n\
1237 When modifying existing code or renaming symbols:\n\
1238 1. Use `lsp_rename_symbol` for all variable/function renames to ensure project-wide safety.\n\
1239 2. After any significant edit, call `lsp_get_diagnostics` on the affected files.\n\
1240 3. If errors are found, you MUST fix them. Do not wait for the user to point them out.\n\n");
1241
1242 sys.push_str(&load_instruction_files());
1244
1245 sys.push_str(&crate::memory::deep_reflect::load_recent_memories());
1247
1248 if !tools.is_empty() {
1250 sys.push_str("\n\n# NATIVE TOOL DECLARATIONS\n");
1251 for tool in tools {
1252 let schema = serde_json::to_string(&tool.function.parameters)
1253 .unwrap_or_else(|_| "{}".to_string());
1254 sys.push_str(&format!(
1255 "<|tool>declaration:{}{}{}<tool|>\n",
1256 tool.function.name, "{", schema
1257 ));
1258 sys.push_str(&format!("// {})\n", tool.function.description));
1259 }
1260 }
1261
1262 sys
1263 }
1264
1265 fn build_system_prompt_compact(
1266 &self,
1267 brief: bool,
1268 professional: bool,
1269 tools: &[ToolDefinition],
1270 ) -> String {
1271 let current_model = self.current_model();
1274 let current_context_length = self.current_context_length();
1275 let os = std::env::consts::OS;
1276
1277 let mut sys = String::from("<|turn>system\n<|think|>\n");
1278 sys.push_str(&format!(
1279 "You are Hematite {}, a local coding harness working on the user's machine.\n",
1280 crate::hematite_version_display()
1281 ));
1282 if professional {
1283 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1284 } else {
1285 sys.push_str(&format!(
1286 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1287 self.species
1288 ));
1289 }
1290 sys.push_str(&format!(
1291 "Model: {} | Context: {} tokens. Keep turns focused.\n",
1292 current_model, current_context_length
1293 ));
1294 if is_gemma4_model_name(¤t_model) {
1295 sys.push_str(
1296 "Gemma 4: use exact tool JSON. No extra prose in tool calls. \
1297 Raw regex patterns in grep_files, no slash delimiters.\n",
1298 );
1299 }
1300 if cfg!(target_os = "windows") {
1301 sys.push_str(&format!(
1302 "OS: {}. Use PowerShell for shell. Never bash or /dev/null.\n",
1303 os
1304 ));
1305 } else {
1306 sys.push_str(&format!("OS: {}. Use native Unix shell.\n", os));
1307 }
1308 if brief {
1309 sys.push_str("BRIEF MODE: one concise sentence unless code is required.\n");
1310 }
1311
1312 sys.push_str(
1313 "\nCORE RULES:\n\
1314 - Read before editing: use `read_file` or `inspect_lines` on a file before mutating it.\n\
1315 - Verify after edits: run `verify_build` after code changes, before committing.\n\
1316 - One tool at a time. Do not batch unrelated tool calls.\n\
1317 - Do not invent tool names, file paths, or symbols not confirmed by tool output.\n\
1318 - Built-in tools first: prefer `read_file`, `edit_file`, `grep_files` over MCP filesystem tools.\n\
1319 - STARTUP/UI CHANGES: read the owner file first, make one focused edit, then run `verify_build`.\n",
1320 );
1321
1322 if !tools.is_empty() {
1323 sys.push_str("\n# AVAILABLE TOOLS\n");
1324 for tool in tools {
1325 let desc: String = tool.function.description.chars().take(120).collect();
1326 sys.push_str(&format!("- {}: {}\n", tool.function.name, desc));
1327 }
1328 }
1329
1330 sys.push_str("<turn|>\n");
1331 sys
1332 }
1333
1334 fn build_system_prompt_tiny(&self, brief: bool, professional: bool) -> String {
1335 let current_model = self.current_model();
1336 let current_context_length = self.current_context_length();
1337 let os = std::env::consts::OS;
1338 let mut sys = format!(
1339 "<|turn>system\nYou are Hematite {}, a local coding harness working on the user's machine.\n",
1340 crate::hematite_version_display()
1341 );
1342 if professional {
1343 sys.push_str("Be direct, technical, concise, and ASCII-first.\n");
1344 } else {
1345 sys.push_str(&format!(
1346 "You are a [{}] local AI coding system. Be direct, concise, and technical.\n",
1347 self.species
1348 ));
1349 }
1350 if !current_model.is_empty() {
1351 sys.push_str(&format!(
1352 "Loaded model: {} | Context window: {} tokens.\n",
1353 current_model, current_context_length
1354 ));
1355 } else {
1356 sys.push_str(&format!(
1357 "Context window: {} tokens.\n",
1358 current_context_length
1359 ));
1360 }
1361 sys.push_str("Tiny-context mode is active. Keep turns short. Prefer final answers over long analysis. Only use tools when necessary.\n");
1362 sys.push_str("Use built-in workspace tools for local inspection and edits. Do not invent tools, files, channels, or symbols.\n");
1363 sys.push_str("Before editing an existing file, gather recent file evidence first. After code edits, verify before commit.\n");
1364 if cfg!(target_os = "windows") {
1365 sys.push_str(&format!(
1366 "You are running on {}. Use PowerShell for shell work. Do not assume bash or /dev/null.\n",
1367 os
1368 ));
1369 } else {
1370 sys.push_str(&format!(
1371 "You are running on {}. Use the native Unix shell conventions.\n",
1372 os
1373 ));
1374 }
1375 if brief {
1376 sys.push_str("BRIEF MODE: answer in one concise sentence unless code is required.\n");
1377 }
1378 if is_gemma4_model_name(¤t_model) {
1379 sys.push_str(
1380 "Gemma 4 note: use exact tool JSON with no extra prose when calling tools.\n",
1381 );
1382 }
1383 sys.push_str("<turn|>\n");
1384 sys
1385 }
1386
1387 pub async fn call_with_tools(
1392 &self,
1393 messages: &[ChatMessage],
1394 tools: &[ToolDefinition],
1395 model_override: Option<&str>,
1397 ) -> Result<
1398 (
1399 Option<String>,
1400 Option<Vec<ToolCallResponse>>,
1401 Option<TokenUsage>,
1402 Option<String>,
1403 ),
1404 String,
1405 > {
1406 let _permit = self
1407 .kv_semaphore
1408 .acquire()
1409 .await
1410 .map_err(|e| e.to_string())?;
1411
1412 let current_model = self.current_model();
1413 let model = model_override.unwrap_or(current_model.as_str()).to_string();
1414 let filtered_tools = if cfg!(target_os = "windows") {
1415 tools
1416 .iter()
1417 .filter(|t| t.function.name != "bash" && t.function.name != "sh")
1418 .cloned()
1419 .collect::<Vec<_>>()
1420 } else {
1421 tools.to_vec()
1422 };
1423
1424 let request_messages = if should_use_gemma_native_formatting(self, &model) {
1425 prepare_gemma_native_messages(messages)
1426 } else {
1427 messages.to_vec()
1428 };
1429
1430 const COMPACT_CORE_TOOLS: &[&str] = &[
1435 "read_file",
1436 "inspect_lines",
1437 "edit_file",
1438 "write_file",
1439 "grep_files",
1440 "list_files",
1441 "verify_build",
1442 "shell",
1443 ];
1444 let effective_tools = if is_compact_context_window(self.current_context_length()) {
1445 let core: Vec<_> = filtered_tools
1446 .iter()
1447 .filter(|t| COMPACT_CORE_TOOLS.contains(&t.function.name.as_str()))
1448 .cloned()
1449 .collect();
1450 if core.is_empty() {
1451 None
1452 } else {
1453 Some(core)
1454 }
1455 } else if filtered_tools.is_empty() {
1456 None
1457 } else {
1458 Some(filtered_tools)
1459 };
1460
1461 let request = ChatRequest {
1462 model: model.clone(),
1463 messages: request_messages,
1464 temperature: 0.2,
1465 stream: false,
1466 tools: effective_tools,
1467 };
1468
1469 preflight_chat_request(
1471 &model,
1472 &request.messages,
1473 request.tools.as_deref().unwrap_or(&[]),
1474 self.current_context_length(),
1475 )?;
1476
1477 let mut last_err = String::new();
1478 let mut response_opt: Option<reqwest::Response> = None;
1479 for attempt in 0..3u32 {
1480 match self.client.post(&self.api_url).json(&request).send().await {
1481 Ok(res) if res.status().is_success() => {
1482 response_opt = Some(res);
1483 break;
1484 }
1485 Ok(res) if res.status().as_u16() >= 500 => {
1486 last_err = format!("LM Studio error {}", res.status());
1487 }
1488 Ok(res) => {
1489 let status = res.status();
1491 let body = res.text().await.unwrap_or_default();
1492 let preview = &body[..body.len().min(300)];
1493 return Err(format!("LM Studio error {}: {}", status, preview));
1494 }
1495 Err(e) if e.is_timeout() || e.is_connect() => {
1496 last_err = format!("Request failed: {}", e);
1497 }
1498 Err(e) => return Err(format!("Request failed: {}", e)),
1499 }
1500 if attempt < 2 {
1501 let delay = std::time::Duration::from_millis(500 * (1u64 << attempt));
1502 tokio::time::sleep(delay.min(std::time::Duration::from_secs(4))).await;
1503 }
1504 }
1505 let res = response_opt
1506 .ok_or_else(|| format!("LM Studio unreachable after 3 attempts: {}", last_err))?;
1507
1508 let body: ChatResponse = res
1509 .json()
1510 .await
1511 .map_err(|e| format!("Response parse error: {}", e))?;
1512
1513 if let Some(usage) = &body.usage {
1514 let mut econ = self.economics.lock().unwrap();
1515 econ.input_tokens += usage.prompt_tokens;
1516 econ.output_tokens += usage.completion_tokens;
1517 }
1518
1519 let choice = body
1520 .choices
1521 .into_iter()
1522 .next()
1523 .ok_or_else(|| "Empty response from model".to_string())?;
1524
1525 let finish_reason = choice.finish_reason;
1526 let mut tool_calls = choice.message.tool_calls;
1527 let mut content = choice.message.content;
1528
1529 if let Some(raw_content) = &content {
1532 let native_calls = extract_native_tool_calls(raw_content);
1533 if !native_calls.is_empty() {
1534 let mut existing = tool_calls.unwrap_or_default();
1535 existing.extend(native_calls);
1536 tool_calls = Some(existing);
1537 let stripped = strip_native_tool_call_text(raw_content);
1538 content = if stripped.trim().is_empty() {
1539 None
1540 } else {
1541 Some(stripped)
1542 };
1543 }
1544 }
1545
1546 if is_gemma4_model_name(&model) {
1547 if let Some(calls) = tool_calls.as_mut() {
1548 for call in calls.iter_mut() {
1549 call.function.arguments = normalize_tool_argument_string(
1550 &call.function.name,
1551 &call.function.arguments,
1552 );
1553 }
1554 }
1555 }
1556
1557 let reasoning_text = choice.message.reasoning_content.unwrap_or_default();
1562 if tool_calls.as_ref().map(|v| v.is_empty()).unwrap_or(true)
1563 && content.as_ref().map(|s| s.trim().is_empty()).unwrap_or(true)
1564 && !reasoning_text.is_empty()
1565 {
1566 let recovered = extract_native_tool_calls(&reasoning_text);
1567 if !recovered.is_empty() {
1568 tool_calls = Some(recovered);
1569 content = None;
1571 }
1572 }
1573
1574 Ok((content, tool_calls, body.usage, finish_reason))
1575 }
1576
1577 pub async fn stream_messages(
1581 &self,
1582 messages: &[ChatMessage],
1583 tx: mpsc::Sender<InferenceEvent>,
1584 ) -> Result<(), Box<dyn std::error::Error>> {
1585 let current_model = self.current_model();
1586 let request_messages = if should_use_gemma_native_formatting(self, ¤t_model) {
1587 prepare_gemma_native_messages(messages)
1588 } else {
1589 messages
1590 .iter()
1591 .map(|m| {
1592 let mut clone = m.clone();
1593 let current_text = m.content.as_str();
1594 if !current_text.starts_with("<|turn>") {
1595 clone.content = MessageContent::Text(format!(
1596 "<|turn>{}\n{}\n<turn|>",
1597 m.role, current_text
1598 ));
1599 }
1600 clone
1601 })
1602 .collect()
1603 };
1604
1605 let request = ChatRequest {
1606 model: current_model.clone(),
1607 messages: request_messages,
1608 temperature: 0.7,
1609 stream: true,
1610 tools: None,
1611 };
1612
1613 if let Err(e) = preflight_chat_request(
1614 ¤t_model,
1615 &request.messages,
1616 &[],
1617 self.current_context_length(),
1618 ) {
1619 let tag = classify_runtime_failure_tag(&e);
1620 let _ = tx
1621 .send(InferenceEvent::ProviderStatus {
1622 state: provider_state_for_failure_tag(tag),
1623 summary: compact_runtime_failure_summary(tag, &e),
1624 })
1625 .await;
1626 let _ = tx
1627 .send(InferenceEvent::Error(format_runtime_failure_message(&e)))
1628 .await;
1629 let _ = tx.send(InferenceEvent::Done).await;
1630 return Ok(());
1631 }
1632
1633 let mut last_err = String::new();
1634 let mut response_opt: Option<reqwest::Response> = None;
1635 for attempt in 0..2u32 {
1636 match self.client.post(&self.api_url).json(&request).send().await {
1637 Ok(res) if res.status().is_success() => {
1638 response_opt = Some(res);
1639 break;
1640 }
1641 Ok(res) if res.status().as_u16() >= 500 => {
1642 last_err = format!("LM Studio error {}", res.status());
1643 }
1644 Ok(res) => {
1645 let status = res.status();
1646 let body = res.text().await.unwrap_or_default();
1647 let preview = &body[..body.len().min(300)];
1648 let detail = format!("LM Studio error {}: {}", status, preview);
1649 let tag = classify_runtime_failure_tag(&detail);
1650 let _ = tx
1651 .send(InferenceEvent::ProviderStatus {
1652 state: provider_state_for_failure_tag(tag),
1653 summary: compact_runtime_failure_summary(tag, &detail),
1654 })
1655 .await;
1656 let _ = tx
1657 .send(InferenceEvent::Error(format_runtime_failure_message(
1658 &detail,
1659 )))
1660 .await;
1661 let _ = tx.send(InferenceEvent::Done).await;
1662 return Ok(());
1663 }
1664 Err(e) if e.is_timeout() || e.is_connect() => {
1665 last_err = format!("Request failed: {}", e);
1666 }
1667 Err(e) => {
1668 let detail = format!("Request failed: {}", e);
1669 let tag = classify_runtime_failure_tag(&detail);
1670 let _ = tx
1671 .send(InferenceEvent::ProviderStatus {
1672 state: provider_state_for_failure_tag(tag),
1673 summary: compact_runtime_failure_summary(tag, &detail),
1674 })
1675 .await;
1676 let _ = tx
1677 .send(InferenceEvent::Error(format_runtime_failure_message(
1678 &detail,
1679 )))
1680 .await;
1681 let _ = tx.send(InferenceEvent::Done).await;
1682 return Ok(());
1683 }
1684 }
1685 if attempt < 1 {
1686 let _ = tx
1687 .send(InferenceEvent::ProviderStatus {
1688 state: ProviderRuntimeState::Recovering,
1689 summary: "LM Studio degraded during stream startup; retrying once.".into(),
1690 })
1691 .await;
1692 tokio::time::sleep(std::time::Duration::from_millis(500)).await;
1693 }
1694 }
1695 let Some(res) = response_opt else {
1696 let detail = format!("LM Studio unreachable after 2 attempts: {}", last_err);
1697 let tag = classify_runtime_failure_tag(&detail);
1698 let _ = tx
1699 .send(InferenceEvent::ProviderStatus {
1700 state: provider_state_for_failure_tag(tag),
1701 summary: compact_runtime_failure_summary(tag, &detail),
1702 })
1703 .await;
1704 let _ = tx
1705 .send(InferenceEvent::Error(format_runtime_failure_message(
1706 &detail,
1707 )))
1708 .await;
1709 let _ = tx.send(InferenceEvent::Done).await;
1710 return Ok(());
1711 };
1712
1713 use futures::StreamExt;
1714 let mut byte_stream = res.bytes_stream();
1715
1716 let mut line_buffer = String::new();
1719 let mut content_buffer = String::new();
1720 let mut past_think = false;
1721 let mut emitted_any_content = false;
1722 let mut emitted_live_status = false;
1723
1724 loop {
1727 let next = tokio::select! {
1728 chunk = byte_stream.next() => chunk,
1730 _ = tokio::time::sleep(std::time::Duration::from_millis(50)) => {
1731 if self.cancel_token.load(std::sync::atomic::Ordering::SeqCst) {
1732 break;
1733 }
1734 continue;
1735 }
1736 };
1737
1738 let Some(item) = next else { break };
1739
1740 let chunk = match item {
1741 Ok(chunk) => chunk,
1742 Err(e) => {
1743 let detail = format!("Request failed: {}", e);
1744 let tag = classify_runtime_failure_tag(&detail);
1745 let _ = tx
1746 .send(InferenceEvent::ProviderStatus {
1747 state: provider_state_for_failure_tag(tag),
1748 summary: compact_runtime_failure_summary(tag, &detail),
1749 })
1750 .await;
1751 let _ = tx
1752 .send(InferenceEvent::Error(format_runtime_failure_message(
1753 &detail,
1754 )))
1755 .await;
1756 let _ = tx.send(InferenceEvent::Done).await;
1757 return Ok(());
1758 }
1759 };
1760 line_buffer.push_str(&String::from_utf8_lossy(&chunk));
1761
1762 while let Some(pos) = line_buffer.find("\n\n") {
1763 let event_str = line_buffer.drain(..pos + 2).collect::<String>();
1764 let data_pos = match event_str.find("data: ") {
1765 Some(p) => p,
1766 None => continue,
1767 };
1768
1769 let data = event_str[data_pos + 6..].trim();
1770 if data == "[DONE]" {
1771 break;
1772 }
1773
1774 if let Ok(json) = serde_json::from_str::<Value>(data) {
1775 if let Some(content) = json["choices"][0]["delta"]["content"].as_str() {
1776 if content.is_empty() {
1777 continue;
1778 }
1779
1780 if !past_think {
1781 let lc = content.to_lowercase();
1782 let close = lc
1783 .find("<channel|>")
1784 .map(|i| (i, "<channel|>".len()))
1785 .or_else(|| lc.find("</think>").map(|i| (i, "</think>".len())));
1786
1787 if let Some((tag_start, tag_len)) = close {
1788 let before = &content[..tag_start];
1790 content_buffer.push_str(before);
1791 if !content_buffer.trim().is_empty() {
1792 let _ = tx
1793 .send(InferenceEvent::Thought(content_buffer.clone()))
1794 .await;
1795 emitted_any_content = true;
1796 }
1797 content_buffer.clear();
1798
1799 past_think = true;
1800 let after = content[tag_start + tag_len..].trim_start_matches('\n');
1801 content_buffer.push_str(after);
1802 } else {
1803 content_buffer.push_str(content);
1805 if content_buffer.len() > 30
1807 && (content.contains('\n') || content.contains('.'))
1808 {
1809 let _ = tx
1810 .send(InferenceEvent::Thought(content_buffer.clone()))
1811 .await;
1812 emitted_any_content = true;
1813 content_buffer.clear();
1814 }
1815 }
1816 } else {
1817 content_buffer.push_str(content);
1820 let is_boundary = content.contains(' ')
1821 || content.contains('.')
1822 || content.contains('!')
1823 || content.contains('?');
1824
1825 if content_buffer.len() > 10 && is_boundary {
1826 if !emitted_live_status {
1827 let _ = tx
1828 .send(InferenceEvent::ProviderStatus {
1829 state: ProviderRuntimeState::Live,
1830 summary: String::new(),
1831 })
1832 .await;
1833 emitted_live_status = true;
1834 }
1835 let _ =
1836 tx.send(InferenceEvent::Token(content_buffer.clone())).await;
1837 emitted_any_content = true;
1838 content_buffer.clear();
1839 }
1840 }
1841 }
1842 }
1843 }
1844 }
1845
1846 if !content_buffer.is_empty() {
1848 if past_think {
1849 if !emitted_live_status {
1850 let _ = tx
1851 .send(InferenceEvent::ProviderStatus {
1852 state: ProviderRuntimeState::Live,
1853 summary: String::new(),
1854 })
1855 .await;
1856 }
1857 let _ = tx.send(InferenceEvent::Token(content_buffer)).await;
1858 } else {
1859 let _ = tx.send(InferenceEvent::Thought(content_buffer)).await;
1860 }
1861 emitted_any_content = true;
1862 }
1863
1864 if !emitted_any_content {
1865 let _ = tx
1866 .send(InferenceEvent::ProviderStatus {
1867 state: ProviderRuntimeState::EmptyResponse,
1868 summary: compact_runtime_failure_summary(
1869 "empty_model_response",
1870 "Empty response from model",
1871 ),
1872 })
1873 .await;
1874 let _ = tx
1875 .send(InferenceEvent::Error(format_runtime_failure_message(
1876 "Empty response from model",
1877 )))
1878 .await;
1879 let _ = tx.send(InferenceEvent::Done).await;
1880 return Ok(());
1881 }
1882
1883 let _ = tx.send(InferenceEvent::Done).await;
1884 Ok(())
1885 }
1886
1887 pub async fn stream_generation(
1889 &self,
1890 prompt: &str,
1891 snark: u8,
1892 chaos: u8,
1893 brief: bool,
1894 professional: bool,
1895 tx: mpsc::Sender<InferenceEvent>,
1896 ) -> Result<(), Box<dyn std::error::Error>> {
1897 let system = self.build_system_prompt(snark, chaos, brief, professional, &[], None, &[]);
1898 let messages = vec![ChatMessage::system(&system), ChatMessage::user(prompt)];
1899 self.stream_messages(&messages, tx).await
1900 }
1901
1902 pub async fn generate_task_worker(
1906 &self,
1907 prompt: &str,
1908 professional: bool,
1909 ) -> Result<String, String> {
1910 let current_model = self.current_model();
1911 let model = self
1912 .worker_model
1913 .as_deref()
1914 .unwrap_or(current_model.as_str());
1915 self.generate_task_with_model(prompt, 0.1, professional, model)
1916 .await
1917 }
1918
1919 pub async fn generate_task(&self, prompt: &str, professional: bool) -> Result<String, String> {
1920 self.generate_task_with_temp(prompt, 0.1, professional)
1921 .await
1922 }
1923
1924 pub async fn generate_task_with_temp(
1925 &self,
1926 prompt: &str,
1927 temp: f32,
1928 professional: bool,
1929 ) -> Result<String, String> {
1930 let current_model = self.current_model();
1931 self.generate_task_with_model(prompt, temp, professional, ¤t_model)
1932 .await
1933 }
1934
1935 pub async fn generate_task_with_model(
1936 &self,
1937 prompt: &str,
1938 temp: f32,
1939 professional: bool,
1940 model: &str,
1941 ) -> Result<String, String> {
1942 let _permit = self
1943 .kv_semaphore
1944 .acquire()
1945 .await
1946 .map_err(|e| e.to_string())?;
1947
1948 let system = self.build_system_prompt(self.snark, 50, false, professional, &[], None, &[]);
1949 let request_messages = if should_use_gemma_native_formatting(self, model) {
1950 prepare_gemma_native_messages(&[
1951 ChatMessage::system(&system),
1952 ChatMessage::user(prompt),
1953 ])
1954 } else {
1955 vec![ChatMessage::system(&system), ChatMessage::user(prompt)]
1956 };
1957 let request = ChatRequest {
1958 model: model.to_string(),
1959 messages: request_messages,
1960 temperature: temp,
1961 stream: false,
1962 tools: None,
1963 };
1964
1965 preflight_chat_request(model, &request.messages, &[], self.current_context_length())?;
1966
1967 let res = self
1968 .client
1969 .post(&self.api_url)
1970 .json(&request)
1971 .send()
1972 .await
1973 .map_err(|e| format!("LM Studio request failed: {}", e))?;
1974
1975 let body: ChatResponse = res
1976 .json()
1977 .await
1978 .map_err(|e| format!("Failed to parse response: {}", e))?;
1979
1980 body.choices
1981 .first()
1982 .and_then(|c| c.message.content.clone())
1983 .ok_or_else(|| "Empty response from model".to_string())
1984 }
1985
1986 #[allow(dead_code)]
1990 pub fn snip_history(
1991 &self,
1992 turns: &[ChatMessage],
1993 max_tokens_estimate: usize,
1994 keep_recent: usize,
1995 ) -> Vec<ChatMessage> {
1996 let total_chars: usize = turns.iter().map(|m| m.content.as_str().len()).sum();
1997 if total_chars / 4 <= max_tokens_estimate {
1998 return turns.to_vec();
1999 }
2000 let keep = keep_recent.min(turns.len());
2001 let mut snipped = vec![turns[0].clone()];
2002 if turns.len() > keep + 1 {
2003 snipped.push(ChatMessage::system(&format!(
2004 "[CONTEXT SNIPPED: {} earlier turns pruned to preserve VRAM]",
2005 turns.len() - keep - 1
2006 )));
2007 snipped.extend_from_slice(&turns[turns.len() - keep..]);
2008 } else {
2009 snipped = turns.to_vec();
2010 }
2011 snipped
2012 }
2013}
2014
2015fn estimate_serialized_tokens<T: Serialize + ?Sized>(value: &T) -> usize {
2016 serde_json::to_vec(value)
2017 .ok()
2018 .map_or(0, |bytes| bytes.len() / 4 + 1)
2019}
2020
2021const IMAGE_PART_TOKEN_ESTIMATE: usize = 1024;
2022
2023fn estimate_message_tokens(message: &ChatMessage) -> usize {
2024 let content_tokens = match &message.content {
2025 MessageContent::Text(s) => s.len() / 4 + 1,
2026 MessageContent::Parts(parts) => parts
2027 .iter()
2028 .map(|part| match part {
2029 ContentPart::Text { text } => text.len() / 4 + 1,
2030 ContentPart::ImageUrl { .. } => IMAGE_PART_TOKEN_ESTIMATE,
2033 })
2034 .sum(),
2035 };
2036 let tool_tokens: usize = message
2037 .tool_calls
2038 .iter()
2039 .map(|call| (call.function.name.len() + call.function.arguments.len()) / 4 + 4)
2040 .sum();
2041 content_tokens + tool_tokens + 6
2042}
2043
2044pub fn estimate_message_batch_tokens(messages: &[ChatMessage]) -> usize {
2045 messages.iter().map(estimate_message_tokens).sum()
2046}
2047
2048fn reserved_output_tokens(context_length: usize) -> usize {
2049 let proportional = (context_length / 8).max(MIN_RESERVED_OUTPUT_TOKENS);
2050 proportional.min(MAX_RESERVED_OUTPUT_TOKENS)
2051}
2052
2053pub fn estimate_prompt_pressure(
2054 messages: &[ChatMessage],
2055 tools: &[ToolDefinition],
2056 context_length: usize,
2057) -> (usize, usize, usize, u8) {
2058 let estimated_input_tokens =
2059 estimate_message_batch_tokens(messages) + estimate_serialized_tokens(tools) + 32;
2060 let reserved_output = reserved_output_tokens(context_length);
2061 let estimated_total = estimated_input_tokens.saturating_add(reserved_output);
2062 let percent = if context_length == 0 {
2063 0
2064 } else {
2065 ((estimated_total.saturating_mul(100)) / context_length).min(100) as u8
2066 };
2067 (
2068 estimated_input_tokens,
2069 reserved_output,
2070 estimated_total,
2071 percent,
2072 )
2073}
2074
2075fn preflight_chat_request(
2076 model: &str,
2077 messages: &[ChatMessage],
2078 tools: &[ToolDefinition],
2079 context_length: usize,
2080) -> Result<(), String> {
2081 let (estimated_input_tokens, reserved_output, estimated_total, _) =
2082 estimate_prompt_pressure(messages, tools, context_length);
2083
2084 if estimated_total > context_length {
2085 return Err(format!(
2086 "context_window_blocked for {}: estimated input {} + reserved output {} = {} tokens exceeds the {}-token context window; narrow the request, compact the session, or preserve grounded tool output instead of restyling it.",
2087 model, estimated_input_tokens, reserved_output, estimated_total, context_length
2088 ));
2089 }
2090
2091 Ok(())
2092}
2093
/// Collects project instruction files from the current directory and up to
/// three of its ancestors.
///
/// In each directory the files `CLAUDE.md`, `CLAUDE.local.md` and
/// `.hematite/instructions.md` are tried in that order. Unreadable or blank
/// files are skipped, as are files whose content was already included
/// (compared by hash). Each file is cut to at most 4,000 bytes and marked
/// with `...[truncated]`; the combined size is limited to 12,000 bytes.
///
/// Returns an empty string when the current directory is unavailable or no
/// file qualifies; otherwise the sections prefixed by a
/// `# Project Instructions` heading.
fn load_instruction_files() -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    let Ok(cwd) = std::env::current_dir() else {
        return String::new();
    };
    let mut result = String::new();
    let mut seen: HashSet<u64> = HashSet::new();
    let mut total_chars: usize = 0;
    const MAX_TOTAL: usize = 12_000;
    const MAX_PER_FILE: usize = 4_000;

    let candidates = ["CLAUDE.md", "CLAUDE.local.md", ".hematite/instructions.md"];

    let mut dir = cwd;
    for _ in 0..4 {
        for name in &candidates {
            let path = dir.join(name);
            // A missing file simply fails to read, so no separate existence
            // check is needed.
            let Ok(content) = std::fs::read_to_string(&path) else {
                continue;
            };
            if content.trim().is_empty() {
                continue;
            }

            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            if !seen.insert(hasher.finish()) {
                continue;
            }

            let truncated = if content.len() > MAX_PER_FILE {
                // Back up to the nearest character boundary: slicing at a
                // fixed byte offset panics inside a multi-byte character.
                let mut cut = MAX_PER_FILE;
                while !content.is_char_boundary(cut) {
                    cut -= 1;
                }
                format!("{}...[truncated]", &content[..cut])
            } else {
                content
            };

            // A file that does not fit the remaining budget ends the scan of
            // this directory; parent directories are still visited.
            if total_chars + truncated.len() > MAX_TOTAL {
                break;
            }
            total_chars += truncated.len();
            result.push_str(&format!("\n--- {} ---\n{}\n", path.display(), truncated));
        }
        match dir.parent().map(|p| p.to_owned()) {
            Some(p) => dir = p,
            None => break,
        }
    }

    if result.is_empty() {
        return String::new();
    }
    format!("\n\n# Project Instructions\n{}", result)
}
2157
/// Extracts the reasoning text that follows a `<|channel>thought` tag.
///
/// Tags are matched case-insensitively. The block ends at the first
/// `<channel|>` after the opening tag, or at the end of `text` when the block
/// is never closed. The result is trimmed.
///
/// Returns `None` when there is no opening tag or the block is blank.
pub fn extract_think_block(text: &str) -> Option<String> {
    // ASCII-only lowercasing keeps every byte offset identical to `text`.
    // Full Unicode lowercasing can change byte lengths, which would make the
    // offsets found below point at the wrong place in the original string.
    let lower = text.to_ascii_lowercase();

    let open_tag = "<|channel>thought";
    let close_tag = "<channel|>";

    let start_pos = lower.find(open_tag)?;
    let content_start = start_pos + open_tag.len();

    let close_pos = lower[content_start..]
        .find(close_tag)
        .map(|p| content_start + p)
        .unwrap_or(text.len());

    let content = text[content_start..close_pos].trim();
    if content.is_empty() {
        None
    } else {
        Some(content.to_string())
    }
}
2180
2181pub fn strip_think_blocks(text: &str) -> String {
2182 let text = {
2186 let t = text.trim_start();
2187 if t.to_lowercase().starts_with("</think>") {
2188 &t[8..]
2189 } else {
2190 text
2191 }
2192 };
2193
2194 let lower = text.to_lowercase();
2195
2196 if let Some(end) = lower.find("<channel|>").map(|i| i + "<channel|>".len()) {
2198 let answer = text[end..]
2199 .replace("<|channel>thought", "")
2200 .replace("<channel|>", "");
2201 return answer.trim().replace("\n\n\n", "\n\n").to_string();
2202 }
2203
2204 let first_open = [
2206 lower.find("<|channel>thought"), lower.find("<think>"),
2208 lower.find("<thought>"),
2209 lower.find("<|think|>"),
2210 ]
2211 .iter()
2212 .filter_map(|&x| x)
2213 .min();
2214
2215 if let Some(start) = first_open {
2216 if start > 0 {
2217 return text[..start].trim().replace("\n\n\n", "\n\n").to_string();
2218 }
2219 return String::new();
2220 }
2221
2222 let naked_reasoning_phrases: &[&str] = &[
2226 "the user asked",
2227 "the user is asking",
2228 "the user wants",
2229 "i will structure",
2230 "i should provide",
2231 "i should give",
2232 "i should avoid",
2233 "i should note",
2234 "i should focus",
2235 "i should keep",
2236 "i should respond",
2237 "i should present",
2238 "i should display",
2239 "i should show",
2240 "i need to",
2241 "i can see from",
2242 "without being overly",
2243 "let me ",
2244 "necessary information in my identity",
2245 "was computed successfully",
2246 "computed successfully",
2247 ];
2248 let is_naked_reasoning = naked_reasoning_phrases.iter().any(|p| lower.contains(p));
2249 if is_naked_reasoning {
2250 let lines: Vec<&str> = text.lines().collect();
2251 if !lines.is_empty() {
2252 let mut start_idx = 0;
2255 for (i, line) in lines.iter().enumerate() {
2256 let l = line.to_lowercase();
2257 let is_reasoning_line =
2258 naked_reasoning_phrases.iter().any(|p| l.contains(p)) || l.trim().is_empty();
2259 if is_reasoning_line {
2260 start_idx = i + 1;
2261 } else {
2262 break;
2263 }
2264 }
2265 if start_idx < lines.len() {
2266 return lines[start_idx..]
2267 .join("\n")
2268 .trim()
2269 .replace("\n\n\n", "\n\n")
2270 .to_string();
2271 }
2272 return String::new();
2274 }
2275 }
2276
2277 let cleaned = strip_xml_tool_call_artifacts(text);
2280 cleaned.trim().replace("\n\n\n", "\n\n").to_string()
2281}
2282
/// Removes stray XML-style tool-call and reasoning tags from `text`.
///
/// Tags are matched ASCII-case-insensitively and processed in the order listed
/// in `XML_ARTIFACTS`. For each tag the search restarts from the beginning
/// after every removal, so a tag that only forms once an inner occurrence has
/// been deleted is removed as well. Text between tags is left untouched.
fn strip_xml_tool_call_artifacts(text: &str) -> String {
    const XML_ARTIFACTS: &[&str] = &[
        "</tool_call>",
        "<tool_call>",
        "</function>",
        "<function>",
        "</parameter>",
        "<parameter>",
        "</arguments>",
        "<arguments>",
        "</tool_use>",
        "<tool_use>",
        "</invoke>",
        "<invoke>",
        "</think>",
        "</thought>",
        "</thinking>",
    ];

    let mut out = text.to_string();
    // ASCII folding preserves byte lengths, so `lower` and `out` share offsets.
    // Unicode lowercasing can change byte lengths, which would make the
    // removal range point at the wrong bytes or panic. Keeping the folded
    // copy in sync also avoids re-lowercasing the whole string per removal.
    let mut lower = out.to_ascii_lowercase();
    for tag in XML_ARTIFACTS {
        while let Some(pos) = lower.find(*tag) {
            let span = pos..pos + tag.len();
            out.replace_range(span.clone(), "");
            lower.replace_range(span, "");
        }
    }
    out
}
2315
2316pub fn extract_native_tool_calls(text: &str) -> Vec<ToolCallResponse> {
2319 use regex::Regex;
2320 let mut results = Vec::new();
2321
2322 let re_call = Regex::new(
2328 r#"(?s)<\|?tool_call\|?>\s*call:([A-Za-z_][A-Za-z0-9_]*)\{(.*?)\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2329 ).unwrap();
2330 let re_arg = Regex::new(r#"(\w+):(?:<\|"\|>(.*?)<\|"\|>|([^,}]*))"#).unwrap();
2333
2334 for cap in re_call.captures_iter(text) {
2335 let name = cap[1].to_string();
2336 let args_str = &cap[2];
2337 let mut arguments = serde_json::Map::new();
2338
2339 for arg_cap in re_arg.captures_iter(args_str) {
2340 let key = arg_cap[1].to_string();
2341 let val_raw = arg_cap
2343 .get(2)
2344 .map(|m| m.as_str())
2345 .or_else(|| arg_cap.get(3).map(|m| m.as_str()))
2346 .unwrap_or("")
2347 .trim();
2348 let normalized_raw = normalize_string_arg(&val_raw.replace("\\\"", "\""));
2349
2350 let val = if normalized_raw == "true" {
2352 Value::Bool(true)
2353 } else if normalized_raw == "false" {
2354 Value::Bool(false)
2355 } else if let Ok(n) = normalized_raw.parse::<i64>() {
2356 Value::Number(n.into())
2357 } else if let Ok(n) = normalized_raw.parse::<u64>() {
2358 Value::Number(n.into())
2359 } else if let Ok(n) = normalized_raw.parse::<f64>() {
2360 serde_json::Number::from_f64(n)
2361 .map(Value::Number)
2362 .unwrap_or(Value::String(normalized_raw.clone()))
2363 } else {
2364 Value::String(normalized_raw)
2365 };
2366
2367 arguments.insert(key, val);
2368 }
2369
2370 results.push(ToolCallResponse {
2371 id: format!("call_{}", rand::random::<u32>()),
2372 call_type: "function".to_string(),
2373 function: ToolCallFn {
2374 name,
2375 arguments: Value::Object(arguments).to_string(),
2376 },
2377 });
2378 }
2379
2380 results
2381}
2382
2383pub fn normalize_tool_argument_string(tool_name: &str, raw: &str) -> String {
2384 let trimmed = raw.trim();
2385 let candidate = unwrap_json_string_once(trimmed).unwrap_or_else(|| trimmed.to_string());
2386
2387 let mut value = match serde_json::from_str::<Value>(&candidate) {
2388 Ok(v) => v,
2389 Err(_) => return candidate,
2390 };
2391 normalize_tool_argument_value(tool_name, &mut value);
2392 value.to_string()
2393}
2394
2395fn normalize_tool_argument_value(tool_name: &str, value: &mut Value) {
2396 match value {
2397 Value::String(s) => *s = normalize_string_arg(s),
2398 Value::Array(items) => {
2399 for item in items {
2400 normalize_tool_argument_value(tool_name, item);
2401 }
2402 }
2403 Value::Object(map) => {
2404 for val in map.values_mut() {
2405 normalize_tool_argument_value(tool_name, val);
2406 }
2407 if tool_name == "grep_files" {
2408 if let Some(Value::String(pattern)) = map.get_mut("pattern") {
2409 *pattern = normalize_regex_pattern(pattern);
2410 }
2411 }
2412 for key in ["path", "extension", "query", "command", "reason"] {
2413 if let Some(Value::String(s)) = map.get_mut(key) {
2414 *s = normalize_string_arg(s);
2415 }
2416 }
2417 }
2418 _ => {}
2419 }
2420}
2421
/// Removes exactly one layer of matching quotes from `input`.
///
/// Accepts `"…"`, `'…'` and `` `…` `` wrappers. Inside the wrapper, `\"`
/// becomes `"` and then `\\` becomes `\`; the result is trimmed.
///
/// Returns `None` when `input` is shorter than two bytes or is not enclosed
/// in the same quote character at both ends.
fn unwrap_json_string_once(input: &str) -> Option<String> {
    // All three quote characters are single-byte ASCII, so comparing the
    // first and last bytes is equivalent to comparing the first and last
    // characters, and slicing one byte off each end stays on char boundaries.
    let inner = match input.as_bytes() {
        [open, .., close] if open == close && matches!(*open, b'"' | b'\'' | b'`') => {
            &input[1..input.len() - 1]
        }
        _ => return None,
    };

    let unescaped = inner.replace("\\\"", "\"").replace("\\\\", "\\");
    Some(unescaped.trim().to_string())
}
2435
/// Trims `input` and peels off every layer of matching surrounding quotes.
///
/// A layer is a pair of identical `"`, `'` or `` ` `` characters at both ends
/// of the string. After each layer is removed the remainder is trimmed again,
/// and peeling repeats until no matching pair is left. A lone quote character
/// is left as-is.
fn normalize_string_arg(input: &str) -> String {
    let mut current = input.trim();

    'peel: loop {
        for quote in ['"', '\'', '`'] {
            let inner = current
                .strip_prefix(quote)
                .and_then(|rest| rest.strip_suffix(quote));
            if let Some(inner) = inner {
                current = inner.trim();
                continue 'peel;
            }
        }
        // No quote pair matched on this pass: nothing left to peel.
        break;
    }

    current.to_string()
}
2453
2454fn normalize_regex_pattern(input: &str) -> String {
2455 let out = normalize_string_arg(input);
2456 if out.len() >= 2 && out.starts_with('/') && out.ends_with('/') {
2457 out[1..out.len() - 1].to_string()
2458 } else {
2459 out
2460 }
2461}
2462
2463fn prepare_gemma_native_messages(messages: &[ChatMessage]) -> Vec<ChatMessage> {
2464 let mut system_blocks = Vec::new();
2465 let mut prepared = Vec::new();
2466 let mut seeded = false;
2467
2468 for message in messages {
2469 if message.role == "system" {
2470 let cleaned = strip_legacy_turn_wrappers(message.content.as_str())
2471 .trim()
2472 .to_string();
2473 if !cleaned.is_empty() {
2474 system_blocks.push(cleaned);
2475 }
2476 continue;
2477 }
2478
2479 let mut clone = message.clone();
2480 clone.content = MessageContent::Text(strip_legacy_turn_wrappers(message.content.as_str()));
2481
2482 if !seeded && message.role == "user" {
2483 let mut merged = String::new();
2484 if !system_blocks.is_empty() {
2485 merged.push_str("System instructions for this turn:\n");
2486 merged.push_str(&system_blocks.join("\n\n"));
2487 merged.push_str("\n\n");
2488 }
2489 merged.push_str(clone.content.as_str());
2490 clone.content = MessageContent::Text(merged);
2491 seeded = true;
2492 }
2493
2494 prepared.push(clone);
2495 }
2496
2497 if !seeded && !system_blocks.is_empty() {
2498 prepared.insert(
2499 0,
2500 ChatMessage::user(&format!(
2501 "System instructions for this turn:\n{}",
2502 system_blocks.join("\n\n")
2503 )),
2504 );
2505 }
2506
2507 prepared
2508}
2509
/// Removes legacy `<|turn>ROLE\n` openers and `<turn|>` closers from `text`.
///
/// The markers are removed one kind at a time, in the order listed, without
/// rescanning for markers that only form after a later removal. The result is
/// trimmed.
fn strip_legacy_turn_wrappers(text: &str) -> String {
    const WRAPPERS: [&str; 5] = [
        "<|turn>system\n",
        "<|turn>user\n",
        "<|turn>assistant\n",
        "<|turn>tool\n",
        "<turn|>",
    ];

    let mut cleaned = text.to_string();
    for marker in WRAPPERS {
        cleaned = cleaned.replace(marker, "");
    }
    cleaned.trim().to_string()
}
2519
2520pub fn strip_native_tool_call_text(text: &str) -> String {
2521 use regex::Regex;
2522 let re_call = Regex::new(
2523 r#"(?s)<\|?tool_call\|?>\s*call:[A-Za-z_][A-Za-z0-9_]*\{.*?\}(?:<\|?tool_call\|?>|\[END_TOOL_REQUEST\])"#
2524 ).unwrap();
2525 let re_response =
2526 Regex::new(r#"(?s)<\|tool_response\|?>.*?(?:<\|tool_response\|?>|<tool_response\|>)"#)
2527 .unwrap();
2528 let without_calls = re_call.replace_all(text, "");
2529 re_response
2530 .replace_all(without_calls.as_ref(), "")
2531 .trim()
2532 .to_string()
2533}
2534
#[cfg(test)]
mod tests {
    use super::*;

    /// The built system prompt must contain the crate's `HEMATITE_VERSION`.
    #[test]
    fn system_prompt_includes_running_hematite_version() {
        let engine = InferenceEngine::new(
            String::from("http://localhost:1234/v1"),
            String::from("strategist"),
            0,
        )
        .expect("engine");

        let prompt = engine.build_system_prompt(0, 50, false, true, &[], None, &[]);
        assert!(prompt.contains(crate::HEMATITE_VERSION));
    }

    /// A call opened with `<|tool_call>` and closed with `<tool_call|>` is
    /// parsed with typed arguments and removed from the visible text.
    #[test]
    fn extracts_gemma_native_tool_call_with_mixed_tool_call_tags() {
        let transcript = r#"<|channel>thought
Reading the next chunk.<channel|>The startup banner wording is likely defined within the UI drawing logic.
<|tool_call>call:read_file{limit:100,offset:100,path:\"src/ui/tui.rs\"}<tool_call|>"#;

        let calls = extract_native_tool_calls(transcript);
        assert_eq!(calls.len(), 1);

        let call = &calls[0];
        assert_eq!(call.function.name, "read_file");

        let args: Value = serde_json::from_str(&call.function.arguments).unwrap();
        assert_eq!(args["limit"].as_i64(), Some(100));
        assert_eq!(args["offset"].as_i64(), Some(100));
        assert_eq!(args["path"].as_str(), Some("src/ui/tui.rs"));

        let visible = strip_native_tool_call_text(transcript);
        assert!(!visible.contains("<|tool_call"));
        assert!(!visible.contains("<tool_call|>"));
    }

    /// Tool responses written by the model itself are stripped from the
    /// visible text, while both tool calls are still extracted in order.
    #[test]
    fn strips_hallucinated_tool_responses_from_native_tool_transcript() {
        let transcript = r#"<|channel>thought
Planning.
<channel|><|tool_call>call:list_files{extension:<|\"|>rs<|\"|>,path:<|\"|>src/<|\"|>}<tool_call|><|tool_response>thought
Mapped src.
<channel|><|tool_call>call:read_file{limit:100,offset:0,path:<|\"|>src/main.rs<|\"|>}<tool_call|><|tool_response>thought
Read main.
<channel|>"#;

        let calls = extract_native_tool_calls(transcript);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].function.name, "list_files");
        assert_eq!(calls[1].function.name, "read_file");

        let visible = strip_native_tool_call_text(transcript);
        for marker in ["<|tool_call", "<|tool_response", "<tool_response|>"] {
            assert!(!visible.contains(marker));
        }
    }
}