1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
use std::path::PathBuf;
use clap::{Args, Subcommand};
use super::util::llm_model_completion_parser;
// Argument container for the `local` command group. The `///` text below is
// left verbatim because clap's derive can surface doc comments as help text.
/// `harn local` — manage local LLM runtimes (Ollama, llama.cpp,
/// MLX, generic OpenAI-compatible servers) through one stable
/// abstraction while underlying CLIs keep changing.
#[derive(Args, Debug)]
pub(crate) struct LocalArgs {
    // The selected `LocalCommand` variant, parsed as a nested subcommand.
    #[command(subcommand)]
    pub command: LocalCommand,
}
// Subcommands available under `LocalArgs`. Each variant wraps the argument
// struct for that subcommand. The `///` lines are reproduced verbatim because
// clap's derive uses them as the subcommand help text.
#[derive(Subcommand, Debug)]
pub(crate) enum LocalCommand {
    /// Survey every local provider Harn knows about: base URL, reachability,
    /// served models, loaded models, memory footprint, context, keep-alive.
    List(LocalListArgs),

    // NOTE(review): this is the only boxed payload — presumably to keep the
    // enum small given how many fields `LocalLaunchArgs` has; confirm.
    /// Launch a Harn-managed local server process and verify it is ready.
    Launch(Box<LocalLaunchArgs>),

    /// Show the currently-selected local provider/model and a brief summary
    /// of every other local runtime.
    Status(LocalStatusArgs),

    /// Make `<alias>` the active local model: warm it on its provider,
    /// unload conflicting models, and persist the selection.
    Switch(LocalSwitchArgs),

    /// Explain the selected local runtime profile and required probes.
    Profile(LocalProfileArgs),

    /// Unload loaded local models. By default targets the active provider;
    /// pass `--all` to unload every reachable local provider.
    Stop(LocalStopArgs),
}
// Arguments for the `LocalCommand::List` subcommand.
#[derive(Args, Debug)]
pub(crate) struct LocalListArgs {
    /// Emit a structured JSON snapshot instead of a human table.
    #[arg(long)]
    pub json: bool,

    // Optional filter; `None` when `--provider` is not given.
    /// Restrict to one provider id (e.g. `ollama`, `llamacpp`, `mlx`).
    #[arg(long)]
    pub provider: Option<String>,
}
// Arguments for the `LocalCommand::Launch` subcommand.
//
// The `///` comments are emitted by clap as `--help` text and are therefore
// kept verbatim; reviewer notes use plain `//` comments.
#[derive(Args, Debug)]
pub(crate) struct LocalLaunchArgs {
    // Required positional argument. `hide_possible_values` keeps the values
    // supplied by the completion parser out of the help output.
    /// Model alias or provider-native model id to serve.
    #[arg(
        value_parser = llm_model_completion_parser(),
        hide_possible_values = true
    )]
    pub model: String,

    /// Local provider runtime to launch or warm (`ollama`, `llamacpp`, `mlx`).
    #[arg(long)]
    pub provider: Option<String>,

    // `--model-path` is accepted as an alias for `--model-source`.
    /// Local model file, directory, or Hugging Face repo id for launched servers.
    #[arg(long = "model-source", alias = "model-path")]
    pub model_source: Option<String>,

    /// Server command to execute.
    #[arg(long = "server-command")]
    pub server_command: Option<String>,

    /// Host interface for the launched server. Defaults to the provider base URL host.
    #[arg(long)]
    pub host: Option<String>,

    /// Port for the launched server. Defaults to the provider catalog base URL.
    #[arg(long)]
    pub port: Option<u16>,

    /// Context window to request from the runtime.
    #[arg(long)]
    pub ctx: Option<u64>,

    // The help text advertises `-1` as a valid value. Without
    // `allow_negative_numbers`, clap treats a space-separated `-1` as an
    // unknown short flag, so `--keep-alive -1` would be rejected and only
    // `--keep-alive=-1` would parse. Only numeric-looking hyphen values are
    // let through; real flags following `--keep-alive` are still flags.
    /// Keep-alive value for Ollama warmup (e.g. `30m`, `forever`, `-1`).
    #[arg(long = "keep-alive", allow_negative_numbers = true)]
    pub keep_alive: Option<String>,

    /// Skip pulling the model when it is missing (Ollama only).
    #[arg(long = "no-pull")]
    pub no_pull: bool,

    /// Number of parallel slots.
    #[arg(long, default_value_t = 1)]
    pub parallel: u64,

    // Kept as a free-form string because the accepted values mix keywords
    // and numbers; validation is not performed in this struct.
    /// llama.cpp GPU layer setting (`auto`, `all`, or a number).
    #[arg(long = "gpu-layers", default_value = "auto")]
    pub gpu_layers: String,

    /// llama.cpp K-cache type.
    #[arg(long = "cache-type-k")]
    pub cache_type_k: Option<String>,

    /// llama.cpp V-cache type.
    #[arg(long = "cache-type-v")]
    pub cache_type_v: Option<String>,

    /// llama.cpp prompt/KV cache RAM MiB cap.
    #[arg(long = "cache-ram")]
    pub cache_ram: Option<u64>,

    /// llama.cpp reasoning mode (`on`, `off`, or `auto`).
    #[arg(long)]
    pub reasoning: Option<String>,

    /// llama.cpp reasoning extraction format, for example `deepseek`.
    #[arg(long = "reasoning-format")]
    pub reasoning_format: Option<String>,

    /// llama.cpp flash-attention mode (`on`, `off`, or `auto`).
    #[arg(long = "flash-attn")]
    pub flash_attn: Option<String>,

    /// Enable the llama.cpp Jinja chat template parser.
    #[arg(long)]
    pub jinja: bool,

    /// Enable the llama.cpp Prometheus metrics endpoint.
    #[arg(long)]
    pub metrics: bool,

    // `allow_hyphen_values` lets pass-through values such as `--foo` be
    // taken as the value of `--server-arg` instead of being parsed as flags.
    /// Extra argument to pass through to the server command. Repeat as needed.
    #[arg(long = "server-arg", allow_hyphen_values = true)]
    pub server_args: Vec<String>,

    /// Readiness timeout in seconds.
    #[arg(long = "timeout-secs", default_value_t = 120)]
    pub timeout_secs: u64,

    /// Log file path. Defaults under Harn local state.
    #[arg(long)]
    pub log: Option<PathBuf>,

    /// Skip unloading other local providers / sibling models before launch.
    #[arg(long = "no-evict")]
    pub no_evict: bool,

    /// Allow a launch even when catalog memory estimates exceed current RAM headroom.
    #[arg(long = "allow-memory-risk")]
    pub allow_memory_risk: bool,

    /// Emit a structured JSON result.
    #[arg(long)]
    pub json: bool,
}
// Arguments for the `LocalCommand::Status` subcommand; the only knob is the
// output format.
#[derive(Args, Debug)]
pub(crate) struct LocalStatusArgs {
    /// Emit a structured JSON snapshot instead of human text.
    #[arg(long)]
    pub json: bool,
}
// Arguments for the `LocalCommand::Switch` subcommand.
//
// The `///` comments are emitted by clap as `--help` text and are therefore
// kept verbatim; reviewer notes use plain `//` comments.
#[derive(Args, Debug)]
pub(crate) struct LocalSwitchArgs {
    // Required positional argument. `hide_possible_values` keeps the values
    // supplied by the completion parser out of the help output.
    /// Model alias or provider-native model id (e.g. `qwen36-coder`,
    /// `ollama:llama3.2`, `mlx-qwen36-27b`).
    #[arg(
        value_parser = llm_model_completion_parser(),
        hide_possible_values = true
    )]
    pub model: String,

    /// Override the inferred provider (e.g. force `--provider llamacpp` for
    /// a GGUF id that would otherwise route to `ollama`).
    #[arg(long)]
    pub provider: Option<String>,

    /// Context window override (Ollama: `num_ctx`). Defaults come from the
    /// machine profile derived from `harn models recommend`.
    #[arg(long)]
    pub ctx: Option<u64>,

    // The help text advertises `-1` as a valid value. Without
    // `allow_negative_numbers`, clap treats a space-separated `-1` as an
    // unknown short flag, so `--keep-alive -1` would be rejected and only
    // `--keep-alive=-1` would parse. Only numeric-looking hyphen values are
    // let through; real flags following `--keep-alive` are still flags.
    /// Keep-alive value to apply on the target provider (Ollama only at the
    /// moment; e.g. `30m`, `forever`, `-1`).
    #[arg(long = "keep-alive", allow_negative_numbers = true)]
    pub keep_alive: Option<String>,

    /// Skip pulling the model when it is missing (Ollama only).
    #[arg(long = "no-pull")]
    pub no_pull: bool,

    /// Skip unloading other local providers / sibling models.
    #[arg(long = "no-evict")]
    pub no_evict: bool,

    /// Allow an experimental or quarantined runtime without passing the
    /// profile's required probes.
    #[arg(long)]
    pub force: bool,

    // Repeatable: each `--probe-result <path>` occurrence appends one entry.
    /// JSON output from `harn provider-tool-probe`; can satisfy the
    /// profile's `tool_probe` requirement.
    #[arg(long = "probe-result")]
    pub probe_results: Vec<PathBuf>,

    // Repeatable: each `--passed-probe <name>` occurrence appends one entry.
    /// Mark an externally-run probe as passed, for example
    /// `--passed-probe two_turn_cache_probe`.
    #[arg(long = "passed-probe")]
    pub passed_probes: Vec<String>,

    /// Emit a structured JSON result.
    #[arg(long)]
    pub json: bool,
}
// Arguments for the `LocalCommand::Profile` subcommand.
#[derive(Args, Debug)]
pub(crate) struct LocalProfileArgs {
    // Required positional argument; possible values from the completion
    // parser are hidden from the help output.
    /// Model alias or provider-native model id.
    #[arg(value_parser = llm_model_completion_parser(), hide_possible_values = true)]
    pub model: String,

    /// Override the inferred provider/runtime.
    #[arg(long)]
    pub provider: Option<String>,

    /// Emit a structured JSON result.
    #[arg(long)]
    pub json: bool,
}
// Arguments for the `LocalCommand::Stop` subcommand.
#[derive(Args, Debug)]
pub(crate) struct LocalStopArgs {
    /// Unload every reachable local provider, not just the active one.
    #[arg(long)]
    pub all: bool,

    // NOTE(review): the precedence over `--all` described in the help text is
    // not enforced by this struct (no `conflicts_with`/`overrides_with`), so
    // it must be implemented by the code that consumes these args — confirm.
    /// Target one provider id (overrides `--all`).
    #[arg(long)]
    pub provider: Option<String>,

    /// Emit a structured JSON result.
    #[arg(long)]
    pub json: bool,
}