1use std::time::Duration;
5
6use serde::Serialize;
7use serde_json::Value;
8
/// Outcome of a warmup request sent to Ollama's `/api/generate` endpoint.
///
/// Produced by `ollama_warmup` and embedded in [`OllamaReadinessResult::warmup`].
#[derive(Debug, Clone, Serialize)]
pub struct OllamaWarmupResult {
    /// `true` only when the warmup request completed without error.
    pub valid: bool,
    /// Machine-readable outcome code: `"ok"`, `"invalid_url"`, or `"warmup_failed"`.
    pub status: String,
    /// Human-readable description of the outcome.
    pub message: String,
    /// Endpoint URL the warmup was sent to (empty when the URL could not be built).
    pub url: String,
    /// Model name that was warmed.
    pub model: String,
    /// HTTP status of the warmup response; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub http_status: Option<u16>,
}
19
/// Report produced by [`ollama_readiness`].
///
/// All `Option` fields are omitted from the serialized output when they are `None`.
#[derive(Debug, Clone, Serialize)]
pub struct OllamaReadinessResult {
    /// `false` as soon as any probe step (or the optional warmup) fails.
    pub valid: bool,
    /// Machine-readable outcome code. `ollama_readiness` sets one of `"ok"`,
    /// `"invalid_url"`, `"daemon_down"`, `"bad_status"`, `"invalid_response"`,
    /// `"model_missing"`, or `"warmup_failed"`.
    pub status: String,
    /// Human-readable description of the outcome.
    pub message: String,
    /// Base URL the probe was run against.
    pub base_url: String,
    /// Full `/api/tags` URL that was queried (empty if it could not be built).
    pub tags_url: String,
    /// Model name as requested by the caller.
    pub model: String,
    /// Installed model name the request resolved to, when one matched.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub matched_model: Option<String>,
    /// Model names reported by `/api/tags`.
    pub available_models: Vec<String>,
    /// HTTP status of the `/api/tags` response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub http_status: Option<u16>,
    /// `keep_alive` value selected for the warmup: the caller's option if given,
    /// otherwise the resolved runtime setting.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub keep_alive: Option<serde_json::Value>,
    /// Warmup outcome; present only when warming was requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub warmup: Option<OllamaWarmupResult>,
    /// Runtime settings resolved for the matched model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub expected: Option<OllamaExpectedRequest>,
    /// Runner reported by `/api/ps` for the matched model, when observation was requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub loaded_runner: Option<OllamaLoadedRunner>,
    /// Explanation of a mismatch between the loaded runner's context length and
    /// the expected `num_ctx`; `None` when they agree or nothing was observed.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub context_drift: Option<String>,
}
51
/// Runtime settings resolved for a model, recorded in
/// [`OllamaReadinessResult::expected`].
#[derive(Debug, Clone, Serialize)]
pub struct OllamaExpectedRequest {
    /// Resolved context window (`num_ctx`).
    pub num_ctx: u64,
    /// Resolved `keep_alive` value from the runtime settings.
    pub keep_alive: Value,
}
60
/// One entry of the `models` array returned by Ollama's `/api/ps` endpoint.
///
/// Optional fields are `None` when the entry lacks them (or they have an
/// unexpected JSON type) and are then omitted from serialized output.
#[derive(Debug, Clone, Serialize)]
pub struct OllamaLoadedRunner {
    /// The entry's `name` field.
    pub name: String,
    /// The entry's `model` field; falls back to `name` when absent.
    pub model: String,
    /// The entry's `context_length` field.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub context_length: Option<u64>,
    /// The entry's `size_vram` field.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub size_vram: Option<u64>,
    /// The entry's `size` field.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub size: Option<u64>,
    /// The entry's `expires_at` field, kept as the raw string.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub expires_at: Option<String>,
}
78
/// Inputs controlling a single [`ollama_readiness`] probe.
#[derive(Debug, Clone)]
pub struct OllamaReadinessOptions {
    /// Model name to look for in the installed-model list.
    pub model: String,
    /// Base URL of the Ollama server; `None` selects the configured default.
    pub base_url: Option<String>,
    /// When `true`, send a warmup request after the model is found.
    pub warm: bool,
    /// `keep_alive` to send with the warmup; `None` uses the resolved runtime setting.
    pub keep_alive: Option<serde_json::Value>,
    /// Timeout for the `/api/tags` request (also applied to the `/api/ps` probe).
    pub tags_timeout: Duration,
    /// Timeout for the warmup request.
    pub warmup_timeout: Duration,
    /// When `true`, query `/api/ps` and report the loaded runner and any context drift.
    pub observe_loaded: bool,
}
91
92impl OllamaReadinessOptions {
93 pub fn new(model: impl Into<String>) -> Self {
94 Self {
95 model: model.into(),
96 base_url: None,
97 warm: false,
98 keep_alive: None,
99 tags_timeout: Duration::from_secs(15),
100 warmup_timeout: Duration::from_secs(135),
101 observe_loaded: false,
102 }
103 }
104}
105
/// Normalizes a raw `keep_alive` string into the JSON value sent to Ollama.
///
/// Public entry point for `parse_keep_alive_str`; returns `None` for blank input.
pub fn normalize_ollama_keep_alive(raw: &str) -> Option<serde_json::Value> {
    parse_keep_alive_str(raw)
}
112
/// `num_ctx` used when no override, environment variable, or catalog entry supplies one.
pub const OLLAMA_DEFAULT_NUM_CTX: u64 = 32_768;
/// `keep_alive` used when neither overrides nor the environment supply one.
pub const OLLAMA_DEFAULT_KEEP_ALIVE: &str = "30m";
/// Unload grace period, in milliseconds, used when the environment supplies none.
pub const OLLAMA_DEFAULT_UNLOAD_GRACE_MS: u64 = 10_000;
/// Highest-priority environment variable for `num_ctx`.
pub const HARN_OLLAMA_NUM_CTX_ENV: &str = "HARN_OLLAMA_NUM_CTX";
/// Highest-priority environment variable for `keep_alive`.
pub const HARN_OLLAMA_KEEP_ALIVE_ENV: &str = "HARN_OLLAMA_KEEP_ALIVE";
/// Highest-priority environment variable for the unload grace period (milliseconds).
pub const HARN_OLLAMA_UNLOAD_GRACE_MS_ENV: &str = "HARN_OLLAMA_UNLOAD_GRACE_MS";
/// Fallback environment variable for the unload grace period (milliseconds).
pub const OLLAMA_UNLOAD_GRACE_MS_ENV: &str = "OLLAMA_UNLOAD_GRACE_MS";
/// Environment variable consulted for the base URL when the caller passes none.
pub const OLLAMA_HOST_ENV: &str = "OLLAMA_HOST";
121
/// Environment variables consulted for `num_ctx`, in priority order; the first
/// one holding a parsable value wins.
const OLLAMA_NUM_CTX_ENV_KEYS: [&str; 3] = [
    HARN_OLLAMA_NUM_CTX_ENV,
    "OLLAMA_CONTEXT_LENGTH",
    "OLLAMA_NUM_CTX",
];
/// Environment variables consulted for `keep_alive`, in priority order.
const OLLAMA_KEEP_ALIVE_ENV_KEYS: [&str; 2] = [HARN_OLLAMA_KEEP_ALIVE_ENV, "OLLAMA_KEEP_ALIVE"];
/// Environment variables consulted for the unload grace period, in priority order.
const OLLAMA_UNLOAD_GRACE_MS_ENV_KEYS: [&str; 2] =
    [HARN_OLLAMA_UNLOAD_GRACE_MS_ENV, OLLAMA_UNLOAD_GRACE_MS_ENV];
/// Base URL used by `resolve_ollama_base_url` when neither the caller nor
/// `OLLAMA_HOST` provides one.
const OLLAMA_DEFAULT_BASE_URL: &str = "http://localhost:11434";
131
/// Resolved per-request Ollama runtime settings.
#[derive(Clone, Debug, PartialEq)]
pub struct OllamaRuntimeSettings {
    /// Context window sent as `options.num_ctx`.
    pub num_ctx: u64,
    /// Value sent as the request's `keep_alive` (a JSON number or string).
    pub keep_alive: Value,
}
137
138impl OllamaRuntimeSettings {
139 pub fn from_env() -> Self {
140 Self::from_env_and_overrides(None)
141 }
142
143 pub fn from_env_and_overrides(overrides: Option<&Value>) -> Self {
144 Self::from_env_overrides_and_model(overrides, None)
145 }
146
147 pub fn from_env_overrides_and_model(overrides: Option<&Value>, model: Option<&str>) -> Self {
148 Self {
149 num_ctx: num_ctx_from_overrides(overrides)
150 .or_else(num_ctx_from_env)
151 .or_else(|| num_ctx_from_model_catalog(model))
152 .unwrap_or(OLLAMA_DEFAULT_NUM_CTX),
153 keep_alive: keep_alive_from_overrides(overrides)
154 .or_else(keep_alive_from_env)
155 .unwrap_or_else(default_keep_alive_value),
156 }
157 }
158
159 pub fn warmup_body(&self, model: &str) -> Value {
160 serde_json::json!({
161 "model": model,
162 "prompt": "",
163 "stream": false,
164 "keep_alive": self.keep_alive,
165 "options": {
166 "num_ctx": self.num_ctx,
167 },
168 })
169 }
170}
171
/// Free-function shorthand for [`OllamaRuntimeSettings::from_env`].
pub fn ollama_runtime_settings_from_env() -> OllamaRuntimeSettings {
    OllamaRuntimeSettings::from_env()
}
175
176pub(crate) fn ollama_unload_grace_duration_from_env() -> Duration {
177 Duration::from_millis(
178 OLLAMA_UNLOAD_GRACE_MS_ENV_KEYS
179 .iter()
180 .find_map(|key| std::env::var(key).ok().and_then(|raw| parse_grace_ms(&raw)))
181 .unwrap_or(OLLAMA_DEFAULT_UNLOAD_GRACE_MS),
182 )
183}
184
/// Warms `model` using runtime settings resolved from the environment and the
/// model catalog (no provider overrides).
///
/// # Errors
/// Returns the error string produced by [`warm_ollama_model_with_settings`].
pub async fn warm_ollama_model(model: &str, base_url: Option<&str>) -> Result<(), String> {
    let settings = OllamaRuntimeSettings::from_env_overrides_and_model(None, Some(model));
    warm_ollama_model_with_settings(model, base_url, &settings).await
}
189
190pub async fn warm_ollama_model_with_settings(
191 model: &str,
192 base_url: Option<&str>,
193 settings: &OllamaRuntimeSettings,
194) -> Result<(), String> {
195 let base_url = resolve_ollama_base_url(base_url);
196 let url = format!("{}/api/generate", base_url.trim_end_matches('/'));
197 let response = crate::llm::shared_utility_client()
198 .post(url)
199 .header("Content-Type", "application/json")
200 .json(&settings.warmup_body(model))
201 .send()
202 .await
203 .map_err(|error| format!("Ollama warmup failed: {error}"))?;
204 if response.status().is_success() {
205 Ok(())
206 } else {
207 let status = response.status();
208 let body = response.text().await.unwrap_or_default();
209 Err(format!("Ollama warmup returned HTTP {status}: {body}"))
210 }
211}
212
213pub(crate) fn apply_ollama_runtime_settings(body: &mut Value, overrides: Option<&Value>) {
214 apply_non_runtime_ollama_overrides(body, overrides);
215
216 let explicit_num_ctx = num_ctx_from_overrides(overrides);
217 if explicit_num_ctx.is_some() || body.pointer("/options/num_ctx").is_none() {
218 let num_ctx = explicit_num_ctx
219 .or_else(num_ctx_from_env)
220 .or_else(|| num_ctx_from_model_catalog(body.get("model").and_then(Value::as_str)))
221 .unwrap_or(OLLAMA_DEFAULT_NUM_CTX);
222 ensure_options_object(body).insert("num_ctx".to_string(), serde_json::json!(num_ctx));
223 }
224
225 let explicit_keep_alive = keep_alive_from_overrides(overrides);
226 if let Some(keep_alive) = explicit_keep_alive
227 .or_else(|| body.get("keep_alive").cloned())
228 .or_else(keep_alive_from_env)
229 .or_else(|| Some(default_keep_alive_value()))
230 {
231 body["keep_alive"] = keep_alive;
232 }
233}
234
235fn resolve_ollama_base_url(base_url: Option<&str>) -> String {
236 base_url
237 .map(str::trim)
238 .filter(|value| !value.is_empty())
239 .map(str::to_string)
240 .or_else(|| {
241 std::env::var(OLLAMA_HOST_ENV)
242 .ok()
243 .map(|value| value.trim().to_string())
244 .filter(|value| !value.is_empty())
245 })
246 .unwrap_or_else(|| OLLAMA_DEFAULT_BASE_URL.to_string())
247}
248
249fn num_ctx_from_env() -> Option<u64> {
250 OLLAMA_NUM_CTX_ENV_KEYS
251 .iter()
252 .find_map(|key| std::env::var(key).ok().and_then(|raw| parse_num_ctx(&raw)))
253}
254
255fn num_ctx_from_model_catalog(model: Option<&str>) -> Option<u64> {
256 let model = model?.trim();
257 if model.is_empty() {
258 return None;
259 }
260 let entry = crate::llm_config::model_catalog_entry(model)?;
261 entry
262 .runtime_context_window
263 .filter(|window| *window > 0)
264 .or_else(|| (entry.context_window > 0).then_some(entry.context_window))
265}
266
267fn keep_alive_from_env() -> Option<Value> {
268 OLLAMA_KEEP_ALIVE_ENV_KEYS.iter().find_map(|key| {
269 std::env::var(key)
270 .ok()
271 .and_then(|raw| parse_keep_alive_str(&raw))
272 })
273}
274
275fn num_ctx_from_overrides(overrides: Option<&Value>) -> Option<u64> {
276 let obj = overrides?.as_object()?;
277 obj.get("num_ctx")
278 .and_then(parse_num_ctx_value)
279 .or_else(|| {
280 obj.get("options")
281 .and_then(|options| options.get("num_ctx"))
282 .and_then(parse_num_ctx_value)
283 })
284}
285
286fn keep_alive_from_overrides(overrides: Option<&Value>) -> Option<Value> {
287 overrides?
288 .as_object()?
289 .get("keep_alive")
290 .and_then(parse_keep_alive_value)
291}
292
/// Parses a context-window size from text, ignoring surrounding whitespace.
/// Returns `None` for non-numeric input and for zero.
fn parse_num_ctx(raw: &str) -> Option<u64> {
    match raw.trim().parse::<u64>() {
        Ok(tokens) if tokens > 0 => Some(tokens),
        _ => None,
    }
}
296
/// Parses a millisecond count from text, ignoring surrounding whitespace.
/// Zero is accepted; non-numeric input yields `None`.
fn parse_grace_ms(raw: &str) -> Option<u64> {
    let digits = raw.trim();
    digits.parse().ok()
}
300
301fn parse_num_ctx_value(value: &Value) -> Option<u64> {
302 match value {
303 Value::Number(number) => number.as_u64().filter(|parsed| *parsed > 0),
304 Value::String(raw) => parse_num_ctx(raw),
305 _ => None,
306 }
307}
308
309fn parse_keep_alive_value(value: &Value) -> Option<Value> {
310 match value {
311 Value::String(raw) => parse_keep_alive_str(raw),
312 Value::Number(_) => Some(value.clone()),
313 _ => None,
314 }
315}
316
317fn parse_keep_alive_str(raw: &str) -> Option<Value> {
318 let trimmed = raw.trim();
319 if trimmed.is_empty() {
320 return None;
321 }
322 Some(match trimmed.to_ascii_lowercase().as_str() {
323 "default" => default_keep_alive_value(),
324 "forever" | "infinite" | "-1" => serde_json::json!(-1),
325 _ => {
326 if let Ok(n) = trimmed.parse::<i64>() {
327 serde_json::json!(n)
328 } else {
329 serde_json::json!(trimmed)
330 }
331 }
332 })
333}
334
335fn default_keep_alive_value() -> Value {
336 serde_json::json!(OLLAMA_DEFAULT_KEEP_ALIVE)
337}
338
339fn ensure_options_object(body: &mut Value) -> &mut serde_json::Map<String, Value> {
340 if !body.get("options").is_some_and(Value::is_object) {
341 body["options"] = serde_json::json!({});
342 }
343 body["options"]
344 .as_object_mut()
345 .expect("options initialized as object")
346}
347
348fn apply_non_runtime_ollama_overrides(body: &mut Value, overrides: Option<&Value>) {
349 let Some(obj) = overrides.and_then(Value::as_object) else {
350 return;
351 };
352
353 for (key, value) in obj {
354 match key.as_str() {
355 "num_ctx" | "keep_alive" => {}
356 "options" => {
357 if let Some(options) = value.as_object() {
358 let body_options = ensure_options_object(body);
359 for (option_key, option_value) in options {
360 if option_key != "num_ctx" {
361 body_options.insert(option_key.clone(), option_value.clone());
362 }
363 }
364 }
365 }
366 _ => {
367 body[key] = value.clone();
368 }
369 }
370 }
371}
372
/// Probes an Ollama server and reports whether `options.model` is ready to use.
///
/// Steps, in order:
/// 1. Resolve the base URL and query `/api/tags`.
/// 2. Match the requested model against the installed model names.
/// 3. Resolve the expected runtime settings for the matched model.
/// 4. If `options.warm` is set, send a warmup request.
/// 5. If `options.observe_loaded` is set, query `/api/ps` and compare the
///    loaded runner's context length with the expected `num_ctx`.
///
/// Failures in steps 1–2 return early with `valid == false` and a status code
/// naming the failing step. The function never returns an error; every outcome
/// is encoded in the returned [`OllamaReadinessResult`].
pub async fn ollama_readiness(options: OllamaReadinessOptions) -> OllamaReadinessResult {
    let base_url = options.base_url.unwrap_or_else(default_ollama_base_url);
    // Starts out as valid/"ok"; each failure path below overwrites that.
    let mut result = OllamaReadinessResult::probing(base_url.clone(), options.model.clone());

    let tags_url = match ollama_endpoint_url(&base_url, "/api/tags") {
        Ok(url) => url,
        Err(message) => return result.fail("invalid_url", message),
    };
    result.tags_url = tags_url.clone();

    let client = crate::llm::shared_utility_client();
    let response = match client
        .get(tags_url.clone())
        .timeout(options.tags_timeout)
        .send()
        .await
    {
        Ok(response) => response,
        Err(error) => {
            // Transport-level failure (including timeout): report the daemon as down.
            return result.fail(
                "daemon_down",
                format!("Ollama not reachable at {tags_url}: {error}"),
            );
        }
    };

    let status = response.status();
    result.http_status = Some(status.as_u16());
    if !status.is_success() {
        // Best effort: an unreadable body degrades to an empty string.
        let body = response.text().await.unwrap_or_default();
        return result.fail(
            "bad_status",
            format!(
                "Ollama returned HTTP {} from /api/tags: {body}",
                status.as_u16()
            ),
        );
    }

    let body: Value = match response.json().await {
        Ok(value) => value,
        Err(error) => {
            return result.fail(
                "invalid_response",
                format!("Could not parse Ollama model list: {error}"),
            );
        }
    };

    let Some(models) = parse_ollama_model_names(&body) else {
        return result.fail(
            "invalid_response",
            "Could not parse Ollama model list: missing models[].name".to_string(),
        );
    };
    result.available_models = models.clone();

    let Some(matched) = find_ollama_model_match(&models, &options.model) else {
        let available = if models.is_empty() {
            "(none)".to_string()
        } else {
            models.join(", ")
        };
        return result.fail(
            "model_missing",
            format!(
                "Ollama model '{}' not found. Available: {available}",
                options.model
            ),
        );
    };
    result.matched_model = Some(matched.clone());

    // Settings are resolved for the matched (installed) name, not the requested one.
    let settings = OllamaRuntimeSettings::from_env_overrides_and_model(None, Some(&matched));
    result.expected = Some(OllamaExpectedRequest {
        num_ctx: settings.num_ctx,
        keep_alive: settings.keep_alive.clone(),
    });
    // A caller-supplied keep_alive takes precedence over the resolved setting.
    let keep_alive = options
        .keep_alive
        .clone()
        .unwrap_or_else(|| settings.keep_alive.clone());
    result.keep_alive = Some(keep_alive.clone());

    result.message = format!("Ollama is reachable and model '{matched}' is available");
    if options.warm {
        let warm = ollama_warmup(
            &base_url,
            &matched,
            Some(keep_alive),
            options.warmup_timeout,
        )
        .await;
        if !warm.valid {
            result.valid = false;
            result.status = "warmup_failed".to_string();
            result.message = warm.message.clone();
        } else {
            result.message = format!("{}; {}", result.message, warm.message);
        }
        result.warmup = Some(warm);
    }

    if options.observe_loaded {
        // Runs even after a failed warmup; reuses the tags timeout for `/api/ps`.
        match fetch_ollama_loaded_runners(&base_url, options.tags_timeout).await {
            Ok(runners) => {
                if let Some(runner) = match_loaded_runner(runners, &matched) {
                    if let Some(actual) = runner.context_length {
                        if actual != settings.num_ctx {
                            result.context_drift =
                                Some(describe_context_drift(settings.num_ctx, actual));
                        }
                    }
                    result.loaded_runner = Some(runner);
                }
            }
            Err(error) => {
                // A failed `/api/ps` probe is noted in the message but does not
                // change `valid` or `status`.
                result.message = format!("{}; /api/ps probe skipped: {error}", result.message);
            }
        }
    }

    result
}
499
500impl OllamaReadinessResult {
501 fn probing(base_url: String, model: String) -> Self {
502 Self {
503 valid: true,
504 status: "ok".to_string(),
505 message: String::new(),
506 base_url,
507 tags_url: String::new(),
508 model,
509 matched_model: None,
510 available_models: Vec::new(),
511 http_status: None,
512 keep_alive: None,
513 warmup: None,
514 expected: None,
515 loaded_runner: None,
516 context_drift: None,
517 }
518 }
519
520 fn fail(mut self, status: &str, message: String) -> Self {
521 self.valid = false;
522 self.status = status.to_string();
523 self.message = message;
524 self
525 }
526}
527
528fn default_ollama_base_url() -> String {
529 crate::llm_config::provider_config("ollama")
530 .as_ref()
531 .map(crate::llm_config::resolve_base_url)
532 .unwrap_or_else(|| "http://localhost:11434".to_string())
533}
534
535fn ollama_endpoint_url(base_url: &str, path: &str) -> Result<String, String> {
536 let mut url = reqwest::Url::parse(base_url)
537 .map_err(|error| format!("Invalid Ollama URL '{base_url}': {error}"))?;
538 if url.host_str() == Some("localhost") {
539 url.set_host(Some("127.0.0.1"))
540 .map_err(|_| format!("Invalid Ollama URL '{base_url}': could not normalize host"))?;
541 }
542 let base_path = url.path().trim_end_matches('/');
543 let suffix = path.trim_start_matches('/');
544 let joined = if base_path.is_empty() {
545 format!("/{suffix}")
546 } else {
547 format!("{base_path}/{suffix}")
548 };
549 url.set_path(&joined);
550 url.set_query(None);
551 Ok(url.to_string())
552}
553
554fn parse_ollama_model_names(value: &serde_json::Value) -> Option<Vec<String>> {
555 let models = value.get("models")?.as_array()?;
556 Some(
557 models
558 .iter()
559 .filter_map(|model| model.get("name").and_then(|name| name.as_str()))
560 .map(str::to_string)
561 .collect(),
562 )
563}
564
/// Resolves the requested model name to one of the installed `models`.
///
/// Candidates are tried in this order, returning the first hit:
/// 1. an exact name match;
/// 2. an installed `<selected>:latest`;
/// 3. an installed name that extends `selected`, provided the extension does
///    not change the model family: a bare name (no `:`) only matches when the
///    installed name continues with a tag (`qwen3` → `qwen3:8b`), while a
///    selection that already carries a tag may match a longer tag
///    (`qwen3.6:35b` → `qwen3.6:35b-a3b`).
///
/// An empty `selected` never matches. Without these guards a plain prefix test
/// would resolve `llama3` to `llama3.2:latest` and an empty name to whichever
/// model is listed first, reporting (and warming) a model nobody asked for.
fn find_ollama_model_match(models: &[String], selected: &str) -> Option<String> {
    if selected.is_empty() {
        return None;
    }

    let selection_has_tag = selected.contains(':');
    let extends_selection = |name: &str| {
        name.strip_prefix(selected)
            .is_some_and(|rest| selection_has_tag || rest.starts_with(':'))
    };

    models
        .iter()
        .find(|name| name.as_str() == selected)
        .or_else(|| {
            models
                .iter()
                .find(|name| name.strip_suffix(":latest") == Some(selected))
        })
        .or_else(|| models.iter().find(|name| extends_selection(name.as_str())))
        .cloned()
}
577
578pub async fn fetch_ollama_loaded_runners(
582 base_url: &str,
583 timeout: Duration,
584) -> Result<Vec<OllamaLoadedRunner>, String> {
585 let url = ollama_endpoint_url(base_url, "/api/ps")?;
586 let response = crate::llm::shared_utility_client()
587 .get(&url)
588 .timeout(timeout)
589 .send()
590 .await
591 .map_err(|error| format!("Ollama /api/ps not reachable at {url}: {error}"))?;
592 if !response.status().is_success() {
593 return Err(format!(
594 "Ollama returned HTTP {} from /api/ps",
595 response.status().as_u16()
596 ));
597 }
598 let body: Value = response
599 .json()
600 .await
601 .map_err(|error| format!("Could not parse Ollama /api/ps response: {error}"))?;
602 Ok(parse_ollama_loaded_runners(&body))
603}
604
605fn parse_ollama_loaded_runners(value: &Value) -> Vec<OllamaLoadedRunner> {
606 let Some(models) = value.get("models").and_then(Value::as_array) else {
607 return Vec::new();
608 };
609 models
610 .iter()
611 .filter_map(|entry| {
612 let name = entry.get("name").and_then(Value::as_str)?.to_string();
613 let model = entry
614 .get("model")
615 .and_then(Value::as_str)
616 .map(str::to_string)
617 .unwrap_or_else(|| name.clone());
618 Some(OllamaLoadedRunner {
619 name,
620 model,
621 context_length: entry.get("context_length").and_then(Value::as_u64),
622 size_vram: entry.get("size_vram").and_then(Value::as_u64),
623 size: entry.get("size").and_then(Value::as_u64),
624 expires_at: entry
625 .get("expires_at")
626 .and_then(Value::as_str)
627 .map(str::to_string),
628 })
629 })
630 .collect()
631}
632
633fn match_loaded_runner(
634 runners: Vec<OllamaLoadedRunner>,
635 model: &str,
636) -> Option<OllamaLoadedRunner> {
637 runners
638 .into_iter()
639 .find(|runner| runner.name == model || runner.model == model)
640}
641
/// Explains a mismatch between the loaded runner's context length (`actual`)
/// and the expected `num_ctx`, including how to reload the runner.
///
/// Any `actual` that is not greater than `expected` gets the "smaller than"
/// wording.
fn describe_context_drift(expected: u64, actual: u64) -> String {
    if actual <= expected {
        return format!(
            "Loaded runner context_length={actual} is smaller than expected num_ctx={expected}. \
             The runner was loaded at a smaller context — unload with `ollama stop <model>` \
             and let Harn re-warm to expand."
        );
    }
    format!(
        "Loaded runner context_length={actual} exceeds expected num_ctx={expected}. Ollama keeps \
         a runner at the context it was loaded with; run `ollama stop <model>` (or wait for \
         keep_alive to expire) and let Harn re-warm it to apply the smaller context."
    )
}
658
659async fn ollama_warmup(
660 base_url: &str,
661 model: &str,
662 keep_alive: Option<serde_json::Value>,
663 timeout: Duration,
664) -> OllamaWarmupResult {
665 let url = match ollama_endpoint_url(base_url, "/api/generate") {
666 Ok(url) => url,
667 Err(message) => {
668 return OllamaWarmupResult {
669 valid: false,
670 status: "invalid_url".to_string(),
671 message,
672 url: String::new(),
673 model: model.to_string(),
674 http_status: None,
675 };
676 }
677 };
678
679 let settings = OllamaRuntimeSettings::from_env_overrides_and_model(None, Some(model));
686 let mut body = settings.warmup_body(model);
687 if let Some(value) = keep_alive {
688 body["keep_alive"] = value;
689 }
690
691 let client = crate::llm::shared_blocking_client();
692 let response = match client
693 .post(url.clone())
694 .header("Content-Type", "application/json")
695 .timeout(timeout)
696 .json(&body)
697 .send()
698 .await
699 {
700 Ok(response) => response,
701 Err(error) => {
702 return OllamaWarmupResult {
703 valid: false,
704 status: "warmup_failed".to_string(),
705 message: format!("Ollama warmup failed for model '{model}' at {url}: {error}"),
706 url,
707 model: model.to_string(),
708 http_status: None,
709 };
710 }
711 };
712
713 let status = response.status();
714 if !status.is_success() {
715 let body = response.text().await.unwrap_or_default();
716 return OllamaWarmupResult {
717 valid: false,
718 status: "warmup_failed".to_string(),
719 message: format!(
720 "Ollama warmup returned HTTP {} for model '{model}': {body}",
721 status.as_u16()
722 ),
723 url,
724 model: model.to_string(),
725 http_status: Some(status.as_u16()),
726 };
727 }
728
729 let body: serde_json::Value = response.json().await.unwrap_or_default();
730 if let Some(error) = body.get("error").and_then(|error| error.as_str()) {
731 return OllamaWarmupResult {
732 valid: false,
733 status: "warmup_failed".to_string(),
734 message: format!("Ollama warmup failed for model '{model}': {error}"),
735 url,
736 model: model.to_string(),
737 http_status: Some(status.as_u16()),
738 };
739 }
740
741 OllamaWarmupResult {
742 valid: true,
743 status: "ok".to_string(),
744 message: format!("Ollama model '{model}' warmed"),
745 url,
746 model: model.to_string(),
747 http_status: Some(status.as_u16()),
748 }
749}
750
751#[cfg(test)]
752mod tests {
753 use super::*;
754 use crate::llm::env_lock;
755 use std::io::{Read, Write};
756 use std::net::TcpListener;
757 use std::sync::{Arc, Mutex};
758 use std::time::Duration;
759
760 struct ScopedEnvVar {
761 key: &'static str,
762 previous: Option<String>,
763 }
764
765 impl ScopedEnvVar {
766 fn set(key: &'static str, value: &str) -> Self {
767 let previous = std::env::var(key).ok();
768 unsafe {
769 std::env::set_var(key, value);
770 }
771 Self { key, previous }
772 }
773
774 fn remove(key: &'static str) -> Self {
775 let previous = std::env::var(key).ok();
776 unsafe {
777 std::env::remove_var(key);
778 }
779 Self { key, previous }
780 }
781 }
782
783 impl Drop for ScopedEnvVar {
784 fn drop(&mut self) {
785 match &self.previous {
786 Some(value) => unsafe { std::env::set_var(self.key, value) },
787 None => unsafe { std::env::remove_var(self.key) },
788 }
789 }
790 }
791
792 #[test]
793 fn runtime_settings_use_harn_env_before_ollama_env() {
794 let _guard = env_lock().lock().expect("env lock");
795 let _env = [
796 ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "131072"),
797 ScopedEnvVar::set("OLLAMA_CONTEXT_LENGTH", "32768"),
798 ScopedEnvVar::set("HARN_OLLAMA_KEEP_ALIVE", "forever"),
799 ScopedEnvVar::set("OLLAMA_KEEP_ALIVE", "5m"),
800 ];
801 let settings = OllamaRuntimeSettings::from_env();
802 assert_eq!(settings.num_ctx, 131072);
803 assert_eq!(settings.keep_alive, serde_json::json!(-1));
804 }
805
806 #[test]
807 fn runtime_settings_apply_harn_defaults() {
808 let _guard = env_lock().lock().expect("env lock");
809 let _env = [
810 ScopedEnvVar::remove("HARN_OLLAMA_NUM_CTX"),
811 ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
812 ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
813 ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
814 ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
815 ];
816 let settings = OllamaRuntimeSettings::from_env();
817 assert_eq!(settings.num_ctx, OLLAMA_DEFAULT_NUM_CTX);
818 assert_eq!(settings.keep_alive, serde_json::json!("30m"));
819 }
820
821 #[test]
822 fn runtime_settings_use_catalog_context_after_env_and_overrides() {
823 let _guard = env_lock().lock().expect("env lock");
824 let _env = [
825 ScopedEnvVar::remove("HARN_OLLAMA_NUM_CTX"),
826 ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
827 ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
828 ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
829 ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
830 ];
831 crate::llm_config::clear_user_overrides();
832 let mut overlay = crate::llm_config::ProvidersConfig::default();
833 overlay.models.insert(
834 "qwen-test".to_string(),
835 crate::llm_config::ModelDef {
836 name: "Qwen Test".to_string(),
837 provider: "ollama".to_string(),
838 context_window: 100_000,
839 runtime_context_window: None,
840 stream_timeout: None,
841 capabilities: vec![],
842 pricing: None,
843 deprecated: false,
844 deprecation_note: None,
845 quality_tags: Vec::new(),
846 prefer_prefill_done: None,
847 },
848 );
849 crate::llm_config::set_user_overrides(Some(overlay));
850
851 let settings = OllamaRuntimeSettings::from_env_overrides_and_model(None, Some("qwen-test"));
852 assert_eq!(settings.num_ctx, 100_000);
853
854 let env = ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "65536");
855 let settings = OllamaRuntimeSettings::from_env_overrides_and_model(None, Some("qwen-test"));
856 assert_eq!(settings.num_ctx, 65_536);
857 drop(env);
858
859 let overrides = serde_json::json!({"num_ctx": 8192});
860 let settings = OllamaRuntimeSettings::from_env_overrides_and_model(
861 Some(&overrides),
862 Some("qwen-test"),
863 );
864 assert_eq!(settings.num_ctx, 8_192);
865
866 crate::llm_config::clear_user_overrides();
867 }
868
869 #[test]
870 fn provider_overrides_beat_env_and_normalize_keep_alive() {
871 let _guard = env_lock().lock().expect("env lock");
872 let _env = [
873 ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "131072"),
874 ScopedEnvVar::set("HARN_OLLAMA_KEEP_ALIVE", "5m"),
875 ];
876 let overrides = serde_json::json!({
877 "num_ctx": "65536",
878 "keep_alive": "infinite",
879 });
880 let settings = OllamaRuntimeSettings::from_env_and_overrides(Some(&overrides));
881 assert_eq!(settings.num_ctx, 65536);
882 assert_eq!(settings.keep_alive, serde_json::json!(-1));
883 }
884
885 #[test]
886 fn apply_runtime_settings_maps_ollama_overrides_to_native_shape() {
887 let _guard = env_lock().lock().expect("env lock");
888 let _env = [
889 ScopedEnvVar::remove("HARN_OLLAMA_NUM_CTX"),
890 ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
891 ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
892 ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
893 ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
894 ];
895 let mut body = serde_json::json!({
896 "model": "qwen",
897 "options": {"temperature": 0.1}
898 });
899 let overrides = serde_json::json!({
900 "num_ctx": 65536,
901 "keep_alive": "default",
902 "options": {"top_k": 20, "num_ctx": 999},
903 "think": true,
904 });
905 apply_ollama_runtime_settings(&mut body, Some(&overrides));
906 assert_eq!(body["options"]["num_ctx"], serde_json::json!(65536));
907 assert_eq!(body["options"]["top_k"], serde_json::json!(20));
908 assert_eq!(body["options"]["temperature"], serde_json::json!(0.1));
909 assert_eq!(body["keep_alive"], serde_json::json!("30m"));
910 assert_eq!(body["think"], serde_json::json!(true));
911 assert!(body.get("num_ctx").is_none());
912 }
913
914 #[test]
915 fn apply_runtime_settings_uses_catalog_context_when_body_has_model() {
916 let _guard = env_lock().lock().expect("env lock");
917 let _env = [
918 ScopedEnvVar::remove("HARN_OLLAMA_NUM_CTX"),
919 ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
920 ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
921 ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
922 ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
923 ];
924 crate::llm_config::clear_user_overrides();
925 let mut overlay = crate::llm_config::ProvidersConfig::default();
926 overlay.models.insert(
927 "qwen-test".to_string(),
928 crate::llm_config::ModelDef {
929 name: "Qwen Test".to_string(),
930 provider: "ollama".to_string(),
931 context_window: 100_000,
932 runtime_context_window: Some(32_768),
933 stream_timeout: None,
934 capabilities: vec![],
935 pricing: None,
936 deprecated: false,
937 deprecation_note: None,
938 quality_tags: Vec::new(),
939 prefer_prefill_done: None,
940 },
941 );
942 crate::llm_config::set_user_overrides(Some(overlay));
943
944 let mut body = serde_json::json!({
945 "model": "qwen-test",
946 "options": {"temperature": 0.1}
947 });
948 apply_ollama_runtime_settings(&mut body, None);
949 assert_eq!(body["options"]["num_ctx"], serde_json::json!(32768));
950 assert_eq!(body["options"]["temperature"], serde_json::json!(0.1));
951
952 crate::llm_config::clear_user_overrides();
953 }
954
955 #[test]
956 fn ollama_keep_alive_normalization_handles_default_and_numbers() {
957 assert_eq!(
958 normalize_ollama_keep_alive("default"),
959 Some(serde_json::json!("30m"))
960 );
961 assert_eq!(
962 normalize_ollama_keep_alive("forever"),
963 Some(serde_json::json!(-1))
964 );
965 assert_eq!(
966 normalize_ollama_keep_alive("120"),
967 Some(serde_json::json!(120))
968 );
969 assert_eq!(
970 normalize_ollama_keep_alive("10m"),
971 Some(serde_json::json!("10m"))
972 );
973 assert_eq!(normalize_ollama_keep_alive(" "), None);
974 }
975
976 fn readiness_options(model: &str, base_url: String) -> OllamaReadinessOptions {
977 OllamaReadinessOptions {
978 model: model.to_string(),
979 base_url: Some(base_url),
980 warm: false,
981 keep_alive: None,
982 tags_timeout: Duration::from_secs(2),
983 warmup_timeout: Duration::from_secs(2),
984 observe_loaded: false,
985 }
986 }
987
988 #[test]
989 fn ollama_readiness_verifies_model_and_warms_matched_tag() {
990 let _guard = env_lock().lock().expect("env lock");
991 let _env = [
992 ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "65536"),
993 ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
994 ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
995 ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
996 ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
997 ];
998 let captured = Arc::new(Mutex::new(Vec::new()));
999 let (addr, server) = spawn_stub(
1000 vec![
1001 (
1002 200,
1003 r#"{"models":[{"name":"qwen3:latest"},{"name":"llama3.2:latest"}]}"#,
1004 ),
1005 (200, r#"{"response":"","done":true}"#),
1006 ],
1007 captured.clone(),
1008 );
1009
1010 let result = tokio::runtime::Runtime::new()
1011 .expect("runtime")
1012 .block_on(ollama_readiness(OllamaReadinessOptions {
1013 warm: true,
1014 keep_alive: Some(serde_json::json!(-1)),
1015 ..readiness_options("qwen3", format!("http://{addr}"))
1016 }));
1017
1018 server.join().expect("stub server");
1019 assert!(result.valid, "result was: {result:?}");
1020 assert_eq!(result.status, "ok");
1021 assert_eq!(result.matched_model.as_deref(), Some("qwen3:latest"));
1022 assert!(result.warmup.as_ref().is_some_and(|warm| warm.valid));
1023 let expected = result.expected.as_ref().expect("expected request");
1024 assert_eq!(expected.num_ctx, 65_536);
1025
1026 let requests = captured.lock().expect("captured requests");
1027 assert!(requests[0].starts_with("GET /api/tags "));
1028 assert!(requests[1].starts_with("POST /api/generate "));
1029 let body = requests[1].split("\r\n\r\n").nth(1).unwrap_or("");
1030 let json: serde_json::Value = serde_json::from_str(body).expect("warmup body");
1031 assert_eq!(json["model"], "qwen3:latest");
1032 assert_eq!(json["prompt"], "");
1033 assert_eq!(json["stream"], false);
1034 assert_eq!(json["keep_alive"], -1);
1035 assert_eq!(
1036 json["options"]["num_ctx"], 65_536,
1037 "warmup must inject num_ctx so Ollama loads the runner at the requested context — see issue #1600"
1038 );
1039 }
1040
1041 #[test]
1042 fn ollama_readiness_reports_missing_model_with_available_tags() {
1043 let captured = Arc::new(Mutex::new(Vec::new()));
1044 let (addr, server) = spawn_stub(
1045 vec![(200, r#"{"models":[{"name":"llama3.2:latest"}]}"#)],
1046 captured,
1047 );
1048
1049 let result = tokio::runtime::Runtime::new()
1050 .expect("runtime")
1051 .block_on(ollama_readiness(readiness_options(
1052 "qwen3",
1053 format!("http://{addr}"),
1054 )));
1055
1056 server.join().expect("stub server");
1057 assert!(!result.valid);
1058 assert_eq!(result.status, "model_missing");
1059 assert_eq!(result.available_models, vec!["llama3.2:latest"]);
1060 assert!(result.message.contains("qwen3"));
1061 }
1062
/// With `observe_loaded` enabled the probe is expected to hit `/api/tags` and
/// then `/api/ps`. Here the stubbed runner reports the same context length
/// (32768) as `HARN_OLLAMA_NUM_CTX`, so no drift message must be produced.
#[test]
fn ollama_readiness_observes_loaded_runner_and_reports_no_drift() {
    // Hold the env lock for the whole test; the scoped variables are declared
    // after it so they are dropped while the lock is still held.
    let _guard = env_lock().lock().expect("env lock");
    let _env = [
        ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "32768"),
        ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
        ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
        ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
        ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
    ];

    // Canned responses, in the order the stub hands them out.
    let tags_body = r#"{"models":[{"name":"qwen3:latest"}]}"#;
    let ps_body = r#"{"models":[{"name":"qwen3:latest","model":"qwen3:latest","context_length":32768,"size_vram":1234,"size":4321,"expires_at":"2026-05-13T12:00:00Z"}]}"#;

    let captured = Arc::new(Mutex::new(Vec::new()));
    let (addr, server) = spawn_stub(
        vec![(200, tags_body), (200, ps_body)],
        Arc::clone(&captured),
    );

    let result = {
        let runtime = tokio::runtime::Runtime::new().expect("runtime");
        let options = OllamaReadinessOptions {
            observe_loaded: true,
            ..readiness_options("qwen3", format!("http://{addr}"))
        };
        runtime.block_on(ollama_readiness(options))
    };

    server.join().expect("stub server");
    assert!(result.valid, "result was: {result:?}");

    let runner = result.loaded_runner.expect("loaded runner present");
    assert_eq!(runner.context_length, Some(32_768));
    assert_eq!(runner.size_vram, Some(1234));
    assert!(
        result.context_drift.is_none(),
        "no drift expected; got {:?}",
        result.context_drift
    );

    // The two probes must arrive in this order.
    let seen = captured.lock().expect("captured requests");
    assert!(seen[0].starts_with("GET /api/tags "));
    assert!(seen[1].starts_with("GET /api/ps "));
}
1107
/// The stubbed runner is loaded at 262144 tokens while `HARN_OLLAMA_NUM_CTX`
/// asks for 32768. Readiness must stay valid but attach a drift message that
/// names both numbers and mentions `ollama stop` as the recovery step.
#[test]
fn ollama_readiness_flags_context_drift_when_loaded_exceeds_expected() {
    let _guard = env_lock().lock().expect("env lock");
    let _env = [
        ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "32768"),
        ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
        ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
        ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
        ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
    ];

    // First response answers the tags probe, second the loaded-runner probe.
    let tags_body = r#"{"models":[{"name":"qwen3.6:35b-a3b-coding-nvfp4"}]}"#;
    let ps_body = r#"{"models":[{"name":"qwen3.6:35b-a3b-coding-nvfp4","model":"qwen3.6:35b-a3b-coding-nvfp4","context_length":262144}]}"#;
    let (addr, server) = spawn_stub(
        vec![(200, tags_body), (200, ps_body)],
        Arc::new(Mutex::new(Vec::new())),
    );

    let result = {
        let runtime = tokio::runtime::Runtime::new().expect("runtime");
        let options = OllamaReadinessOptions {
            observe_loaded: true,
            ..readiness_options("qwen3.6:35b-a3b-coding-nvfp4", format!("http://{addr}"))
        };
        runtime.block_on(ollama_readiness(options))
    };

    server.join().expect("stub server");
    assert!(result.valid, "result was: {result:?}");

    let drift = result.context_drift.expect("drift expected");
    // Both the loaded and the expected context sizes must be spelled out.
    for needle in ["262144", "32768"] {
        assert!(drift.contains(needle), "drift message: {drift}");
    }
    assert!(
        drift.contains("ollama stop"),
        "drift message must teach the user how to recover: {drift}"
    );
}
1149
/// An alias that resolves to a catalog model must make readiness use that
/// model's `runtime_context_window` (98304 here) both in the reported
/// `expected.num_ctx` and in the `options.num_ctx` of the warmup request body.
#[test]
fn ollama_readiness_uses_alias_resolved_runtime_settings() {
    /// Clears the user overrides installed by this test when dropped, so a
    /// failing assertion or a panicking stub thread cannot leave the overlay
    /// behind for whichever test runs next.
    struct UserOverridesReset;

    impl Drop for UserOverridesReset {
        fn drop(&mut self) {
            crate::llm_config::clear_user_overrides();
        }
    }

    let _guard = env_lock().lock().expect("env lock");
    let _env = [
        ScopedEnvVar::remove("HARN_OLLAMA_NUM_CTX"),
        ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
        ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
        ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
        ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
    ];

    // Start from a clean slate, then arm the reset guard. It is declared after
    // the env lock, so on unwind it runs while the lock is still held.
    crate::llm_config::clear_user_overrides();
    let overrides_reset = UserOverridesReset;

    let mut overlay = crate::llm_config::ProvidersConfig::default();
    overlay.aliases.insert(
        "qwen3.6-coding".to_string(),
        crate::llm_config::AliasDef {
            id: "qwen3.6:35b-a3b-coding-nvfp4".to_string(),
            provider: "ollama".to_string(),
            tool_format: None,
        },
    );
    overlay.models.insert(
        "qwen3.6:35b-a3b-coding-nvfp4".to_string(),
        crate::llm_config::ModelDef {
            name: "Qwen 3.6 Coding".to_string(),
            provider: "ollama".to_string(),
            context_window: 262_144,
            runtime_context_window: Some(98_304),
            stream_timeout: None,
            capabilities: vec![],
            pricing: None,
            deprecated: false,
            deprecation_note: None,
            quality_tags: Vec::new(),
            prefer_prefill_done: None,
        },
    );
    crate::llm_config::set_user_overrides(Some(overlay));

    let (resolved, _) = crate::llm_config::resolve_model("qwen3.6-coding");
    assert_eq!(resolved, "qwen3.6:35b-a3b-coding-nvfp4");

    // The stub serves the tags probe first, then the warmup generate call.
    let captured = Arc::new(Mutex::new(Vec::new()));
    let (addr, server) = spawn_stub(
        vec![
            (
                200,
                r#"{"models":[{"name":"qwen3.6:35b-a3b-coding-nvfp4"}]}"#,
            ),
            (200, r#"{"response":"","done":true}"#),
        ],
        captured.clone(),
    );

    let result = tokio::runtime::Runtime::new()
        .expect("runtime")
        .block_on(ollama_readiness(OllamaReadinessOptions {
            warm: true,
            ..readiness_options(&resolved, format!("http://{addr}"))
        }));

    server.join().expect("stub server");
    // Success path: clear the overrides at the same point the test always did.
    drop(overrides_reset);

    assert!(result.valid, "result was: {result:?}");
    let expected = result.expected.expect("expected request populated");
    assert_eq!(
        expected.num_ctx, 98_304,
        "alias-resolved model must pull runtime_context_window from the catalog"
    );

    // The second captured request is the warmup; its JSON body follows the
    // blank line that terminates the HTTP headers.
    let requests = captured.lock().expect("captured requests");
    let warmup_body = requests[1].split("\r\n\r\n").nth(1).unwrap_or("");
    let json: serde_json::Value = serde_json::from_str(warmup_body).expect("warmup body");
    assert_eq!(json["options"]["num_ctx"], 98_304);
}
1225
/// `fetch_ollama_loaded_runners` must accept runner entries that omit the
/// optional fields and must populate those fields when they are present.
#[test]
fn fetch_ollama_loaded_runners_parses_optional_fields() {
    // Two runners: `a` carries only its names, `b` carries optional metadata.
    let ps_body = r#"{"models":[{"name":"a:latest","model":"a:latest"},{"name":"b:latest","model":"b:latest","context_length":8192,"size_vram":42,"expires_at":"now"}]}"#;
    let (addr, server) = spawn_stub(vec![(200, ps_body)], Arc::new(Mutex::new(Vec::new())));

    let runners = {
        let runtime = tokio::runtime::Runtime::new().expect("runtime");
        let base_url = format!("http://{addr}");
        runtime
            .block_on(fetch_ollama_loaded_runners(
                &base_url,
                Duration::from_secs(2),
            ))
            .expect("ps response parses")
    };
    server.join().expect("stub server");

    assert_eq!(runners.len(), 2);
    let (bare, detailed) = (&runners[0], &runners[1]);
    assert_eq!(bare.name, "a:latest");
    assert!(bare.context_length.is_none());
    assert_eq!(detailed.context_length, Some(8192));
    assert_eq!(detailed.size_vram, Some(42));
    assert_eq!(detailed.expires_at.as_deref(), Some("now"));
}
1253
1254 fn spawn_stub(
1255 responses: Vec<(u16, &'static str)>,
1256 captured: Arc<Mutex<Vec<String>>>,
1257 ) -> (std::net::SocketAddr, std::thread::JoinHandle<()>) {
1258 let listener = TcpListener::bind("127.0.0.1:0").expect("bind ollama stub");
1259 let addr = listener.local_addr().expect("stub addr");
1260 let handle = std::thread::spawn(move || {
1261 for (status, body) in responses {
1262 let (mut stream, _) = listener.accept().expect("accept request");
1263 stream
1264 .set_read_timeout(Some(Duration::from_secs(2)))
1265 .expect("read timeout");
1266 let request = read_http_request(&mut stream);
1267 captured.lock().expect("captured").push(request);
1268 let reason = if status == 200 { "OK" } else { "ERROR" };
1269 let response = format!(
1270 "HTTP/1.1 {status} {reason}\r\ncontent-type: application/json\r\ncontent-length: {}\r\nconnection: close\r\n\r\n{body}",
1271 body.len()
1272 );
1273 stream
1274 .write_all(response.as_bytes())
1275 .expect("write response");
1276 }
1277 });
1278 (addr, handle)
1279 }
1280
/// Reads one HTTP request from `stream` and returns it as text.
///
/// Reading stops as soon as the header terminator (`\r\n\r\n`) has been seen
/// and at least `Content-Length` body bytes have arrived (zero when the header
/// is absent or unparsable), or when the peer closes the connection.
///
/// Panics if a read fails or if the received bytes are not valid UTF-8.
fn read_http_request(stream: &mut std::net::TcpStream) -> String {
    let mut received: Vec<u8> = Vec::new();
    let mut chunk = [0_u8; 512];

    loop {
        let count = stream.read(&mut chunk).expect("read request");
        if count == 0 {
            // Peer closed the connection; return whatever has arrived.
            break;
        }
        received.extend_from_slice(&chunk[..count]);

        // Re-inspect everything received so far; the headers may have been
        // split across several reads.
        let snapshot = String::from_utf8_lossy(&received);
        if let Some((head, _)) = snapshot.split_once("\r\n\r\n") {
            // First header line that is a parsable Content-Length wins.
            let mut declared_body_len = 0_usize;
            for line in head.lines() {
                if let Some((name, value)) = line.split_once(':') {
                    if name.eq_ignore_ascii_case("content-length") {
                        if let Ok(len) = value.trim().parse::<usize>() {
                            declared_body_len = len;
                            break;
                        }
                    }
                }
            }

            // Headers + blank-line terminator + declared body are all present.
            if received.len() >= head.len() + 4 + declared_body_len {
                break;
            }
        }
    }

    String::from_utf8(received).expect("utf8 request")
}
1309}