Skip to main content

harn_vm/llm/
local_profiles.rs

1//! Data-driven local runtime risk profiles.
2//!
3//! This layer keeps local model selection explicit: the model family, runtime,
4//! known risks, required probes, and workarounds are all table data that CLI
5//! lifecycle commands can explain and enforce.
6
7use std::collections::{BTreeMap, BTreeSet};
8
9use serde::{Deserialize, Serialize};
10
11use super::tool_conformance::{report_satisfies_required_probe, ToolConformanceReport};
12use crate::llm_config::{self, LocalMemoryDef, ModelDef};
13
/// Risk classification attached to one model-family/runtime pairing.
///
/// Variants serialize with snake_case names; `as_str` returns the same
/// spellings for display purposes.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RuntimeProfileStatus {
    /// Not probe-gated: `requires_probe_gate` returns `false`.
    Preferred,
    /// Probe-gated: `requires_probe_gate` returns `true`.
    Experimental,
    /// Probe-gated. NOTE(review): the name suggests the runtime is only
    /// worth trying for vision workloads; confirm against the profile table.
    VisionOnlyExperimental,
    /// Probe-gated: `requires_probe_gate` returns `true`.
    Quarantined,
    /// Status of the fallback profile built when the profile table has no
    /// entry for the provider. Not probe-gated.
    Unknown,
}
23
24impl RuntimeProfileStatus {
25    pub fn as_str(&self) -> &'static str {
26        match self {
27            Self::Preferred => "preferred",
28            Self::Experimental => "experimental",
29            Self::VisionOnlyExperimental => "vision_only_experimental",
30            Self::Quarantined => "quarantined",
31            Self::Unknown => "unknown",
32        }
33    }
34
35    pub fn requires_probe_gate(&self) -> bool {
36        !matches!(self, Self::Preferred | Self::Unknown)
37    }
38}
39
/// Risk profile for running one model family on one runtime.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RuntimeProfile {
    /// Risk classification of this pairing.
    pub status: RuntimeProfileStatus,
    /// Probe names the gate checks against collected evidence when the
    /// status is probe-gated.
    pub requires: Vec<String>,
    /// Suggested context size, if any (presumably in tokens; confirm with
    /// the runtime integration).
    pub recommended_num_ctx: Option<u64>,
    /// Identifiers of known failure modes for this pairing.
    pub known_risks: Vec<String>,
    /// Human-readable mitigation advice.
    pub workarounds: Vec<String>,
    /// Free-form human-readable remarks.
    pub notes: Vec<String>,
}
49
/// Profile lookup result for one model on one provider.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LocalRuntimeProfileReport {
    /// Alias the model was requested under, when one is known.
    pub alias: Option<String>,
    /// Model identifier the report was built for.
    pub model_id: String,
    /// Provider (runtime) name used to select the profile.
    pub provider: String,
    /// Family label derived from the alias and model id.
    pub model_family: String,
    /// Runtime whose profile was selected; populated with the provider name.
    pub selected_runtime: String,
    /// Copy of `selected.status`.
    pub selected_status: RuntimeProfileStatus,
    /// Whether `selected_status` is probe-gated.
    pub requires_probe_gate: bool,
    /// Profile for `provider`, or the generic fallback when the family has
    /// no entry for it.
    pub selected: RuntimeProfile,
    /// Every profile known for the family, keyed by runtime name.
    pub runtime_profiles: BTreeMap<String, RuntimeProfile>,
}
62
/// Host memory measurements used to size the recommended context.
///
/// When both values are present, the accelerator figure is the one consulted.
#[derive(Debug, Clone, Copy, Default, PartialEq)]
pub struct RuntimeProfileHost {
    /// Available system memory in GiB, if measured.
    pub system_available_gib: Option<f64>,
    /// Free accelerator memory in GiB, if measured.
    pub accelerator_free_gib: Option<f64>,
}
68
/// Outcome of checking a profile report against collected probe evidence.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RuntimeProfileGate {
    /// `true` when no required probe is missing or when the gate was forced.
    pub allowed: bool,
    /// Echo of the caller's force flag.
    pub forced: bool,
    /// Status of the profile that was evaluated.
    pub selected_status: RuntimeProfileStatus,
    /// Required probes that the evidence did not satisfy; empty when the
    /// status is not probe-gated.
    pub missing_required_probes: Vec<String>,
    /// Probe names recorded as passed in the evidence.
    pub passed_probes: Vec<String>,
    /// Human-readable summary of the decision.
    pub message: String,
}
78
79#[derive(Debug, Clone, Default)]
80pub struct RuntimeProbeEvidence {
81    passed: BTreeSet<String>,
82    tool_reports: Vec<ToolConformanceReport>,
83}
84
85impl RuntimeProbeEvidence {
86    pub fn new() -> Self {
87        Self::default()
88    }
89
90    pub fn add_passed(&mut self, probe: impl Into<String>) {
91        let probe = probe.into();
92        if !probe.trim().is_empty() {
93            self.passed.insert(probe);
94        }
95    }
96
97    pub fn add_tool_report(&mut self, report: ToolConformanceReport) {
98        if report_satisfies_required_probe(&report, "tool_probe") {
99            self.passed.insert("tool_probe".to_string());
100            self.passed.insert("tool_call_probe".to_string());
101        }
102        if report_satisfies_required_probe(&report, "native_tool_probe") {
103            self.passed.insert("native_tool_probe".to_string());
104        }
105        if report_satisfies_required_probe(&report, "streaming_tool_probe") {
106            self.passed.insert("streaming_tool_probe".to_string());
107        }
108        self.tool_reports.push(report);
109    }
110
111    pub fn passed(&self) -> Vec<String> {
112        self.passed.iter().cloned().collect()
113    }
114
115    fn satisfies(&self, requirement: &str) -> bool {
116        self.passed.contains(requirement)
117            || self
118                .tool_reports
119                .iter()
120                .any(|report| report_satisfies_required_probe(report, requirement))
121    }
122}
123
124pub fn local_runtime_profile_report(
125    selector: &str,
126    provider_override: Option<&str>,
127) -> LocalRuntimeProfileReport {
128    let resolved = llm_config::resolve_model_info(selector);
129    let provider = provider_override
130        .map(str::trim)
131        .filter(|provider| !provider.is_empty())
132        .map(str::to_string)
133        .unwrap_or_else(|| resolved.provider.clone());
134    local_runtime_profile_report_for(resolved.alias.as_deref(), &resolved.id, &provider)
135}
136
/// Builds the runtime profile report for an already-resolved model.
///
/// Delegates to `local_runtime_profile_report_for_host` with no host
/// measurements, so no host-based adjustment is applied.
pub fn local_runtime_profile_report_for(
    alias: Option<&str>,
    model_id: &str,
    provider: &str,
) -> LocalRuntimeProfileReport {
    local_runtime_profile_report_for_host(alias, model_id, provider, None)
}
144
145pub fn local_runtime_profile_report_for_host(
146    alias: Option<&str>,
147    model_id: &str,
148    provider: &str,
149    host: Option<RuntimeProfileHost>,
150) -> LocalRuntimeProfileReport {
151    let family = model_family(alias, model_id);
152    let catalog_model = llm_config::model_catalog_entry(model_id);
153    let runtime_profiles = profiles_for_family(family)
154        .into_iter()
155        .map(|(runtime, profile)| {
156            let adjusted = adjust_profile_for_host(
157                family,
158                &runtime,
159                catalog_model.as_ref(),
160                profile,
161                host.as_ref(),
162            );
163            (runtime, adjusted)
164        })
165        .collect::<BTreeMap<_, _>>();
166    let selected = runtime_profiles
167        .get(provider)
168        .cloned()
169        .unwrap_or_else(|| generic_profile(provider));
170    LocalRuntimeProfileReport {
171        alias: alias.map(str::to_string),
172        model_id: model_id.to_string(),
173        provider: provider.to_string(),
174        model_family: family.to_string(),
175        selected_runtime: provider.to_string(),
176        selected_status: selected.status.clone(),
177        requires_probe_gate: selected.status.requires_probe_gate(),
178        selected,
179        runtime_profiles,
180    }
181}
182
183pub fn evaluate_runtime_profile_gate(
184    report: &LocalRuntimeProfileReport,
185    evidence: &RuntimeProbeEvidence,
186    force: bool,
187) -> RuntimeProfileGate {
188    let missing: Vec<String> = if report.selected_status.requires_probe_gate() {
189        report
190            .selected
191            .requires
192            .iter()
193            .filter(|requirement| !evidence.satisfies(requirement))
194            .cloned()
195            .collect()
196    } else {
197        Vec::new()
198    };
199    let allowed = force || missing.is_empty();
200    let message = if force {
201        format!(
202            "{} via {} is {} but allowed by --force",
203            report.model_id,
204            report.provider,
205            report.selected_status.as_str()
206        )
207    } else if allowed {
208        format!(
209            "{} via {} is {}",
210            report.model_id,
211            report.provider,
212            report.selected_status.as_str()
213        )
214    } else {
215        format!(
216            "{} via {} is {}; required probes missing: {}",
217            report.model_id,
218            report.provider,
219            report.selected_status.as_str(),
220            missing.join(", ")
221        )
222    };
223    RuntimeProfileGate {
224        allowed,
225        forced: force,
226        selected_status: report.selected_status.clone(),
227        missing_required_probes: missing,
228        passed_probes: evidence.passed(),
229        message,
230    }
231}
232
/// Classifies a model into a known family label.
///
/// Matching is ASCII case-insensitive and looks at both the alias (when
/// present) and the model id. Qwen 3.6 markers are checked before Gemma 4
/// markers; anything else is `"generic-local"`.
fn model_family(alias: Option<&str>, model_id: &str) -> &'static str {
    // Lowercase each name once; an absent alias behaves like an empty name.
    let alias = alias.unwrap_or_default().to_ascii_lowercase();
    let model_id = model_id.to_ascii_lowercase();
    let mentions = |marker: &str| alias.contains(marker) || model_id.contains(marker);

    if mentions("qwen3.6") || mentions("qwen36") {
        "qwen3.6-a3b-hybrid"
    } else if mentions("gemma4") || mentions("gemma-4") {
        "gemma4-hybrid-moe"
    } else {
        "generic-local"
    }
}
247
248fn profiles_for_family(family: &str) -> BTreeMap<String, RuntimeProfile> {
249    match family {
250        "qwen3.6-a3b-hybrid" => BTreeMap::from([
251            (
252                "ollama".to_string(),
253                profile(
254                    RuntimeProfileStatus::Preferred,
255                    &["tool_probe", "effective_context_probe"],
256                    Some(32_768),
257                    &[],
258                    &[
259                        "Use the text tool wire format unless a fresh native probe passes.",
260                        "Keep an explicit num_ctx so the resident runner matches eval settings.",
261                    ],
262                    &["Best cheap local default on the 2026-05-13 Burin eval pass."],
263                ),
264            ),
265            (
266                "llamacpp".to_string(),
267                profile(
268                    RuntimeProfileStatus::Experimental,
269                    &["tool_probe"],
270                    Some(65_536),
271                    &["inflated_input_token_accounting_on_repeated_turns"],
272                    &[
273                        "Run a tool probe before write-heavy evals.",
274                        "Record llama.cpp build, ctx, cache type, and prefix-cache telemetry in eval receipts.",
275                    ],
276                    &[
277                        "Current llama.cpp builds reuse two-turn Qwen3.6 hybrid-cache prefixes; keep token accounting visible in receipts.",
278                    ],
279                ),
280            ),
281            (
282                "mlx".to_string(),
283                profile(
284                    RuntimeProfileStatus::VisionOnlyExperimental,
285                    &[
286                        "served_model_identity_probe",
287                        "persistent_readiness_probe",
288                        "tool_probe",
289                    ],
290                    None,
291                    &[
292                        "stale_or_default_v1_models_identity",
293                        "hybrid_prefix_cache_reuse_gap",
294                    ],
295                    &[
296                        "Probe /v1/models twice and send one minimal chat request before selection.",
297                        "Record server flags for APC, context length, batching, and thinking mode.",
298                    ],
299                    &["Use only when MLX-specific throughput or vision support is needed."],
300                ),
301            ),
302        ]),
303        "gemma4-hybrid-moe" => BTreeMap::from([
304            (
305                "ollama".to_string(),
306                profile(
307                    RuntimeProfileStatus::Quarantined,
308                    &["tool_probe"],
309                    Some(32_768),
310                    &[
311                        "raw_tool_tag_no_structured_calls",
312                        "completion_prose_without_executable_tool_calls",
313                    ],
314                    &[
315                        "Allow only after the one-tool probe returns native or parseable text calls.",
316                        "Use text mode and corrective retry for write-required turns.",
317                    ],
318                    &[
319                        "Gemma4 through Ollama has produced raw <tool_call> blocks and final prose in local evals.",
320                    ],
321                ),
322            ),
323            (
324                "llamacpp".to_string(),
325                profile(
326                    RuntimeProfileStatus::Experimental,
327                    &["tool_probe", "two_turn_cache_probe"],
328                    Some(32_768),
329                    &[
330                        "full_prompt_reprocess_on_hybrid_cache",
331                        "parser_template_drift",
332                    ],
333                    &[
334                        "Confirm the served template emits parseable calls before any write eval.",
335                        "Treat final prose as insufficient when artifacts are unchanged.",
336                    ],
337                    &["Prefer as an eval candidate, not a default editing runtime."],
338                ),
339            ),
340            (
341                "mlx".to_string(),
342                profile(
343                    RuntimeProfileStatus::Experimental,
344                    &[
345                        "served_model_identity_probe",
346                        "persistent_readiness_probe",
347                        "tool_probe",
348                    ],
349                    None,
350                    &[
351                        "raw_gemma_tool_markers_in_content",
352                        "hybrid_prefix_cache_reuse_gap",
353                    ],
354                    &[
355                        "Keep raw marker parser fixtures enabled in the Harn text parser.",
356                        "Verify OpenAI-compatible tool_calls is non-empty before native mode.",
357                    ],
358                    &["Use explicit server flags instead of opaque defaults."],
359                ),
360            ),
361            (
362                "local".to_string(),
363                profile(
364                    RuntimeProfileStatus::Experimental,
365                    &["tool_probe"],
366                    Some(32_768),
367                    &["provider_specific_parser_required"],
368                    &["Prefer text mode until native parser support is proven."],
369                    &["Generic local Gemma endpoints vary by serving stack."],
370                ),
371            ),
372        ]),
373        _ => BTreeMap::new(),
374    }
375}
376
377fn adjust_profile_for_host(
378    family: &str,
379    runtime: &str,
380    model: Option<&ModelDef>,
381    mut profile: RuntimeProfile,
382    host: Option<&RuntimeProfileHost>,
383) -> RuntimeProfile {
384    if family == "qwen3.6-a3b-hybrid" && runtime == "llamacpp" {
385        if let (Some(model), Some(host)) = (model, host) {
386            if let Some(ctx) = recommended_context_from_local_memory(model, host) {
387                profile.recommended_num_ctx = Some(ctx);
388            }
389        }
390    }
391    profile
392}
393
394fn recommended_context_from_local_memory(
395    model: &ModelDef,
396    host: &RuntimeProfileHost,
397) -> Option<u64> {
398    let memory = model.local_memory.as_ref()?;
399    let available_gib = host
400        .accelerator_free_gib
401        .or(host.system_available_gib)
402        .filter(|available| *available > 0.0)?;
403    let base = memory.base_resident_gib?;
404    let kv_per_1k = scaled_kv_cache_gib_per_1k(memory)?;
405    let safety = memory.safety_margin_gib.unwrap_or(4.0);
406    let usable_for_kv = available_gib - base - safety;
407    if usable_for_kv <= 0.0 {
408        return Some(8_192);
409    }
410
411    let by_memory = ((usable_for_kv / kv_per_1k) * 1_000.0).floor() as u64;
412    let ceiling = memory
413        .max_recommended_context
414        .or(model.runtime_context_window)
415        .unwrap_or(model.context_window)
416        .min(model.context_window);
417    let floor = 65_536_u64.min(ceiling).min(model.context_window);
418    Some(round_context_down(by_memory.min(ceiling).max(floor)))
419}
420
421fn scaled_kv_cache_gib_per_1k(memory: &LocalMemoryDef) -> Option<f64> {
422    let base = memory.kv_cache_gib_per_1k_ctx?;
423    let multiplier = memory
424        .default_cache_type
425        .as_ref()
426        .and_then(|cache_type| memory.cache_type_multipliers.get(cache_type))
427        .copied()
428        .unwrap_or(1.0);
429    let scaled = base * multiplier;
430    (scaled > 0.0).then_some(scaled)
431}
432
/// Rounds `ctx` down to a multiple of 8,192, never returning less than 8,192.
fn round_context_down(ctx: u64) -> u64 {
    const STEP: u64 = 8_192;
    let whole_steps = ctx / STEP;
    if whole_steps == 0 {
        // Values below one step are raised to a single step.
        STEP
    } else {
        whole_steps * STEP
    }
}
437
438fn generic_profile(provider: &str) -> RuntimeProfile {
439    RuntimeProfile {
440        status: RuntimeProfileStatus::Unknown,
441        requires: vec!["readiness_probe".to_string()],
442        recommended_num_ctx: None,
443        known_risks: Vec::new(),
444        workarounds: Vec::new(),
445        notes: vec![format!(
446            "No dedicated local runtime profile for provider `{provider}` and this model family."
447        )],
448    }
449}
450
451fn profile(
452    status: RuntimeProfileStatus,
453    requires: &[&str],
454    recommended_num_ctx: Option<u64>,
455    known_risks: &[&str],
456    workarounds: &[&str],
457    notes: &[&str],
458) -> RuntimeProfile {
459    RuntimeProfile {
460        status,
461        requires: requires.iter().map(|value| (*value).to_string()).collect(),
462        recommended_num_ctx,
463        known_risks: known_risks
464            .iter()
465            .map(|value| (*value).to_string())
466            .collect(),
467        workarounds: workarounds
468            .iter()
469            .map(|value| (*value).to_string())
470            .collect(),
471        notes: notes.iter().map(|value| (*value).to_string()).collect(),
472    }
473}
474
// Unit tests for profile selection and gating.
// NOTE(review): these go through `llm_config` alias resolution and catalog
// lookups, so the expected values depend on configuration data defined
// outside this file.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::llm::tool_conformance::{classify_tool_conformance_fixture, ToolProbeMode};

    // The Qwen 3.6 alias maps to the Qwen family; Ollama is preferred while
    // llama.cpp is experimental with only the tool probe required.
    #[test]
    fn qwen_ollama_profile_is_preferred_and_llamacpp_is_experimental() {
        let ollama = local_runtime_profile_report("local-qwen3.6", Some("ollama"));
        assert_eq!(ollama.model_family, "qwen3.6-a3b-hybrid");
        assert_eq!(ollama.selected_status, RuntimeProfileStatus::Preferred);

        let llamacpp = local_runtime_profile_report("local-qwen3.6", Some("llamacpp"));
        assert_eq!(llamacpp.selected_status, RuntimeProfileStatus::Experimental);
        assert_eq!(llamacpp.selected.requires, vec!["tool_probe".to_string()]);
        assert!(!llamacpp
            .selected
            .known_risks
            .contains(&"full_prompt_reprocess_on_hybrid_cache".to_string()));
    }

    // With 32 GiB of free accelerator memory the recommended context is
    // raised above the table default.
    #[test]
    fn qwen_llamacpp_profile_raises_context_when_accelerator_memory_fits() {
        let report = local_runtime_profile_report_for_host(
            Some("local-qwen3.6"),
            "qwen3.6-35b-a3b-ud-q4-k-xl",
            "llamacpp",
            Some(RuntimeProfileHost {
                system_available_gib: None,
                accelerator_free_gib: Some(32.0),
            }),
        );
        assert_eq!(report.selected.recommended_num_ctx, Some(262_144));
    }

    // With 24 GiB of free accelerator memory the recommendation stays close
    // to the table default.
    #[test]
    fn qwen_llamacpp_profile_keeps_conservative_context_when_memory_is_tight() {
        let report = local_runtime_profile_report_for_host(
            Some("local-qwen3.6"),
            "qwen3.6-35b-a3b-ud-q4-k-xl",
            "llamacpp",
            Some(RuntimeProfileHost {
                system_available_gib: None,
                accelerator_free_gib: Some(24.0),
            }),
        );
        assert_eq!(report.selected.recommended_num_ctx, Some(73_728));
    }

    // A quarantined profile is blocked without evidence and allowed once a
    // tool conformance report satisfying the tool probe is added.
    #[test]
    fn gemma4_ollama_profile_is_quarantined_until_tool_probe_passes() {
        let report = local_runtime_profile_report("ollama-gemma4", None);
        assert_eq!(report.selected_status, RuntimeProfileStatus::Quarantined);
        let gate = evaluate_runtime_profile_gate(&report, &RuntimeProbeEvidence::new(), false);
        assert!(!gate.allowed);
        assert_eq!(gate.missing_required_probes, vec!["tool_probe".to_string()]);

        let mut evidence = RuntimeProbeEvidence::new();
        evidence.add_tool_report(classify_tool_conformance_fixture(
            "ollama",
            "gemma4:26b",
            ToolProbeMode::NonStreaming,
            "harn_tool_probe_marker",
            r#"{"content":"echo_marker({ value: \"harn_tool_probe_marker\" })"}"#,
        ));
        let gate = evaluate_runtime_profile_gate(&report, &evidence, false);
        assert!(gate.allowed, "{gate:?}");
    }

    // Forcing opens the gate for a probe-gated profile even with no evidence,
    // and the gate records that it was forced.
    #[test]
    fn force_allows_risky_profile_with_receipt() {
        let report = local_runtime_profile_report("local-qwen3.6", None);
        assert_eq!(report.selected_status, RuntimeProfileStatus::Experimental);
        let gate = evaluate_runtime_profile_gate(&report, &RuntimeProbeEvidence::new(), true);
        assert!(gate.allowed);
        assert!(gate.forced);
    }
}
551}