Skip to main content

harn_vm/llm/api/
ollama.rs

1//! Ollama-specific runtime settings consumed by chat, completion, and
2//! model warmup paths.
3
4use std::time::Duration;
5
6use serde::Serialize;
7use serde_json::Value;
8
9#[derive(Debug, Clone, Serialize)]
10pub struct OllamaWarmupResult {
11    pub valid: bool,
12    pub status: String,
13    pub message: String,
14    pub url: String,
15    pub model: String,
16    #[serde(skip_serializing_if = "Option::is_none")]
17    pub http_status: Option<u16>,
18}
19
20#[derive(Debug, Clone, Serialize)]
21pub struct OllamaReadinessResult {
22    pub valid: bool,
23    pub status: String,
24    pub message: String,
25    pub base_url: String,
26    pub tags_url: String,
27    pub model: String,
28    #[serde(skip_serializing_if = "Option::is_none")]
29    pub matched_model: Option<String>,
30    pub available_models: Vec<String>,
31    #[serde(skip_serializing_if = "Option::is_none")]
32    pub http_status: Option<u16>,
33    #[serde(skip_serializing_if = "Option::is_none")]
34    pub keep_alive: Option<serde_json::Value>,
35    #[serde(skip_serializing_if = "Option::is_none")]
36    pub warmup: Option<OllamaWarmupResult>,
37    /// Runtime settings Harn would inject into a request body for this
38    /// model, computed from env, provider overrides, and the catalog.
39    #[serde(skip_serializing_if = "Option::is_none")]
40    pub expected: Option<OllamaExpectedRequest>,
41    /// The matched runner reported by `/api/ps`, if the model is currently
42    /// loaded. The `context_length` field is the effective context the
43    /// loaded runner was started with — this is what `ollama ps` prints.
44    #[serde(skip_serializing_if = "Option::is_none")]
45    pub loaded_runner: Option<OllamaLoadedRunner>,
46    /// Set when the loaded runner's `context_length` differs from the
47    /// `expected.num_ctx` Harn would request. Explains how to reload.
48    #[serde(skip_serializing_if = "Option::is_none")]
49    pub context_drift: Option<String>,
50}
51
52/// The runtime knobs Harn would attach to chat/completion/warmup
53/// requests for a given model. Surfaced by readiness so callers can
54/// compare it against what `/api/ps` says is actually loaded.
55#[derive(Debug, Clone, Serialize)]
56pub struct OllamaExpectedRequest {
57    pub num_ctx: u64,
58    pub keep_alive: Value,
59}
60
61/// One entry from `GET /api/ps` — a model the Ollama daemon currently
62/// has loaded into memory. `context_length` is the effective context
63/// the runner was started with and is fixed for the lifetime of the
64/// loaded process; reloading is required to change it.
65#[derive(Debug, Clone, Serialize)]
66pub struct OllamaLoadedRunner {
67    pub name: String,
68    pub model: String,
69    #[serde(skip_serializing_if = "Option::is_none")]
70    pub context_length: Option<u64>,
71    #[serde(skip_serializing_if = "Option::is_none")]
72    pub size_vram: Option<u64>,
73    #[serde(skip_serializing_if = "Option::is_none")]
74    pub size: Option<u64>,
75    #[serde(skip_serializing_if = "Option::is_none")]
76    pub expires_at: Option<String>,
77}
78
79#[derive(Debug, Clone)]
80pub struct OllamaReadinessOptions {
81    pub model: String,
82    pub base_url: Option<String>,
83    pub warm: bool,
84    pub keep_alive: Option<serde_json::Value>,
85    pub tags_timeout: Duration,
86    pub warmup_timeout: Duration,
87    /// Hit `/api/ps` and report any drift between the loaded runner's
88    /// `context_length` and the `num_ctx` Harn would request.
89    pub observe_loaded: bool,
90}
91
92impl OllamaReadinessOptions {
93    pub fn new(model: impl Into<String>) -> Self {
94        Self {
95            model: model.into(),
96            base_url: None,
97            warm: false,
98            keep_alive: None,
99            tags_timeout: Duration::from_secs(15),
100            warmup_timeout: Duration::from_secs(135),
101            observe_loaded: false,
102        }
103    }
104}
105
106/// Public wrapper around the internal keep-alive parser, used by callers
107/// (CLI flags, host bridges) that want the same normalization Harn applies
108/// to environment overrides.
109pub fn normalize_ollama_keep_alive(raw: &str) -> Option<serde_json::Value> {
110    parse_keep_alive_str(raw)
111}
112
/// `num_ctx` used when overrides, environment, and the model catalog
/// all fail to provide one.
pub const OLLAMA_DEFAULT_NUM_CTX: u64 = 32_768;
/// `keep_alive` used when neither overrides nor environment provide one.
pub const OLLAMA_DEFAULT_KEEP_ALIVE: &str = "30m";
/// Unload grace period, in milliseconds, used when no environment
/// variable provides a parseable value.
pub const OLLAMA_DEFAULT_UNLOAD_GRACE_MS: u64 = 10_000;
/// Harn-specific environment variable for the context size.
pub const HARN_OLLAMA_NUM_CTX_ENV: &str = "HARN_OLLAMA_NUM_CTX";
/// Harn-specific environment variable for the keep-alive value.
pub const HARN_OLLAMA_KEEP_ALIVE_ENV: &str = "HARN_OLLAMA_KEEP_ALIVE";
/// Harn-specific environment variable for the unload grace period.
pub const HARN_OLLAMA_UNLOAD_GRACE_MS_ENV: &str = "HARN_OLLAMA_UNLOAD_GRACE_MS";
/// Fallback environment variable for the unload grace period.
pub const OLLAMA_UNLOAD_GRACE_MS_ENV: &str = "OLLAMA_UNLOAD_GRACE_MS";
/// Environment variable consulted for the daemon base URL.
pub const OLLAMA_HOST_ENV: &str = "OLLAMA_HOST";

// Lookup order for each setting: the first variable that yields a
// usable value wins, so the Harn-specific name takes precedence.
const OLLAMA_NUM_CTX_ENV_KEYS: [&str; 3] = [
    HARN_OLLAMA_NUM_CTX_ENV,
    "OLLAMA_CONTEXT_LENGTH",
    "OLLAMA_NUM_CTX",
];
const OLLAMA_KEEP_ALIVE_ENV_KEYS: [&str; 2] = [HARN_OLLAMA_KEEP_ALIVE_ENV, "OLLAMA_KEEP_ALIVE"];
const OLLAMA_UNLOAD_GRACE_MS_ENV_KEYS: [&str; 2] =
    [HARN_OLLAMA_UNLOAD_GRACE_MS_ENV, OLLAMA_UNLOAD_GRACE_MS_ENV];
// Base URL used when neither the caller nor `OLLAMA_HOST` supplies one.
const OLLAMA_DEFAULT_BASE_URL: &str = "http://localhost:11434";
131
132#[derive(Clone, Debug, PartialEq, Eq)]
133pub struct OllamaRuntimeSettings {
134    pub num_ctx: u64,
135    pub keep_alive: Value,
136}
137
138impl OllamaRuntimeSettings {
139    pub fn from_env() -> Self {
140        Self::from_env_and_overrides(None)
141    }
142
143    pub fn from_env_and_overrides(overrides: Option<&Value>) -> Self {
144        Self::from_env_overrides_and_model(overrides, None)
145    }
146
147    pub fn from_env_overrides_and_model(overrides: Option<&Value>, model: Option<&str>) -> Self {
148        Self {
149            num_ctx: num_ctx_from_overrides(overrides)
150                .or_else(num_ctx_from_env)
151                .or_else(|| num_ctx_from_model_catalog(model))
152                .unwrap_or(OLLAMA_DEFAULT_NUM_CTX),
153            keep_alive: keep_alive_from_overrides(overrides)
154                .or_else(keep_alive_from_env)
155                .unwrap_or_else(default_keep_alive_value),
156        }
157    }
158
159    pub fn warmup_body(&self, model: &str) -> Value {
160        serde_json::json!({
161            "model": model,
162            "prompt": "",
163            "stream": false,
164            "keep_alive": self.keep_alive,
165            "options": {
166                "num_ctx": self.num_ctx,
167            },
168        })
169    }
170}
171
172pub fn ollama_runtime_settings_from_env() -> OllamaRuntimeSettings {
173    OllamaRuntimeSettings::from_env()
174}
175
176pub(crate) fn ollama_unload_grace_duration_from_env() -> Duration {
177    Duration::from_millis(
178        OLLAMA_UNLOAD_GRACE_MS_ENV_KEYS
179            .iter()
180            .find_map(|key| std::env::var(key).ok().and_then(|raw| parse_grace_ms(&raw)))
181            .unwrap_or(OLLAMA_DEFAULT_UNLOAD_GRACE_MS),
182    )
183}
184
185pub async fn warm_ollama_model(model: &str, base_url: Option<&str>) -> Result<(), String> {
186    let settings = OllamaRuntimeSettings::from_env_overrides_and_model(None, Some(model));
187    warm_ollama_model_with_settings(model, base_url, &settings).await
188}
189
190pub async fn warm_ollama_model_with_settings(
191    model: &str,
192    base_url: Option<&str>,
193    settings: &OllamaRuntimeSettings,
194) -> Result<(), String> {
195    let base_url = resolve_ollama_base_url(base_url);
196    let url = format!("{}/api/generate", base_url.trim_end_matches('/'));
197    let response = crate::llm::shared_utility_client()
198        .post(url)
199        .header("Content-Type", "application/json")
200        .json(&settings.warmup_body(model))
201        .send()
202        .await
203        .map_err(|error| format!("Ollama warmup failed: {error}"))?;
204    if response.status().is_success() {
205        Ok(())
206    } else {
207        let status = response.status();
208        let body = response.text().await.unwrap_or_default();
209        Err(format!("Ollama warmup returned HTTP {status}: {body}"))
210    }
211}
212
213pub(crate) fn apply_ollama_runtime_settings(body: &mut Value, overrides: Option<&Value>) {
214    apply_non_runtime_ollama_overrides(body, overrides);
215
216    let explicit_num_ctx = num_ctx_from_overrides(overrides);
217    if explicit_num_ctx.is_some() || body.pointer("/options/num_ctx").is_none() {
218        let num_ctx = explicit_num_ctx
219            .or_else(num_ctx_from_env)
220            .or_else(|| num_ctx_from_model_catalog(body.get("model").and_then(Value::as_str)))
221            .unwrap_or(OLLAMA_DEFAULT_NUM_CTX);
222        ensure_options_object(body).insert("num_ctx".to_string(), serde_json::json!(num_ctx));
223    }
224
225    let explicit_keep_alive = keep_alive_from_overrides(overrides);
226    if let Some(keep_alive) = explicit_keep_alive
227        .or_else(|| body.get("keep_alive").cloned())
228        .or_else(keep_alive_from_env)
229        .or_else(|| Some(default_keep_alive_value()))
230    {
231        body["keep_alive"] = keep_alive;
232    }
233}
234
235fn resolve_ollama_base_url(base_url: Option<&str>) -> String {
236    base_url
237        .map(str::trim)
238        .filter(|value| !value.is_empty())
239        .map(str::to_string)
240        .or_else(|| {
241            std::env::var(OLLAMA_HOST_ENV)
242                .ok()
243                .map(|value| value.trim().to_string())
244                .filter(|value| !value.is_empty())
245        })
246        .unwrap_or_else(|| OLLAMA_DEFAULT_BASE_URL.to_string())
247}
248
249fn num_ctx_from_env() -> Option<u64> {
250    OLLAMA_NUM_CTX_ENV_KEYS
251        .iter()
252        .find_map(|key| std::env::var(key).ok().and_then(|raw| parse_num_ctx(&raw)))
253}
254
255fn num_ctx_from_model_catalog(model: Option<&str>) -> Option<u64> {
256    let model = model?.trim();
257    if model.is_empty() {
258        return None;
259    }
260    let entry = crate::llm_config::model_catalog_entry(model)?;
261    entry
262        .runtime_context_window
263        .filter(|window| *window > 0)
264        .or_else(|| (entry.context_window > 0).then_some(entry.context_window))
265}
266
267fn keep_alive_from_env() -> Option<Value> {
268    OLLAMA_KEEP_ALIVE_ENV_KEYS.iter().find_map(|key| {
269        std::env::var(key)
270            .ok()
271            .and_then(|raw| parse_keep_alive_str(&raw))
272    })
273}
274
275fn num_ctx_from_overrides(overrides: Option<&Value>) -> Option<u64> {
276    let obj = overrides?.as_object()?;
277    obj.get("num_ctx")
278        .and_then(parse_num_ctx_value)
279        .or_else(|| {
280            obj.get("options")
281                .and_then(|options| options.get("num_ctx"))
282                .and_then(parse_num_ctx_value)
283        })
284}
285
286fn keep_alive_from_overrides(overrides: Option<&Value>) -> Option<Value> {
287    overrides?
288        .as_object()?
289        .get("keep_alive")
290        .and_then(parse_keep_alive_value)
291}
292
/// Parses a context size from text. Surrounding whitespace is ignored;
/// only strictly positive integers are accepted.
fn parse_num_ctx(raw: &str) -> Option<u64> {
    match raw.trim().parse::<u64>() {
        Ok(value) if value > 0 => Some(value),
        _ => None,
    }
}
296
/// Parses a grace period in milliseconds from text. Surrounding
/// whitespace is ignored; zero is a valid value.
fn parse_grace_ms(raw: &str) -> Option<u64> {
    let digits = raw.trim();
    digits.parse().ok()
}
300
301fn parse_num_ctx_value(value: &Value) -> Option<u64> {
302    match value {
303        Value::Number(number) => number.as_u64().filter(|parsed| *parsed > 0),
304        Value::String(raw) => parse_num_ctx(raw),
305        _ => None,
306    }
307}
308
309fn parse_keep_alive_value(value: &Value) -> Option<Value> {
310    match value {
311        Value::String(raw) => parse_keep_alive_str(raw),
312        Value::Number(_) => Some(value.clone()),
313        _ => None,
314    }
315}
316
317fn parse_keep_alive_str(raw: &str) -> Option<Value> {
318    let trimmed = raw.trim();
319    if trimmed.is_empty() {
320        return None;
321    }
322    Some(match trimmed.to_ascii_lowercase().as_str() {
323        "default" => default_keep_alive_value(),
324        "forever" | "infinite" | "-1" => serde_json::json!(-1),
325        _ => {
326            if let Ok(n) = trimmed.parse::<i64>() {
327                serde_json::json!(n)
328            } else {
329                serde_json::json!(trimmed)
330            }
331        }
332    })
333}
334
335fn default_keep_alive_value() -> Value {
336    serde_json::json!(OLLAMA_DEFAULT_KEEP_ALIVE)
337}
338
339fn ensure_options_object(body: &mut Value) -> &mut serde_json::Map<String, Value> {
340    if !body.get("options").is_some_and(Value::is_object) {
341        body["options"] = serde_json::json!({});
342    }
343    body["options"]
344        .as_object_mut()
345        .expect("options initialized as object")
346}
347
348fn apply_non_runtime_ollama_overrides(body: &mut Value, overrides: Option<&Value>) {
349    let Some(obj) = overrides.and_then(Value::as_object) else {
350        return;
351    };
352
353    for (key, value) in obj {
354        match key.as_str() {
355            "num_ctx" | "keep_alive" => {}
356            "options" => {
357                if let Some(options) = value.as_object() {
358                    let body_options = ensure_options_object(body);
359                    for (option_key, option_value) in options {
360                        if option_key != "num_ctx" {
361                            body_options.insert(option_key.clone(), option_value.clone());
362                        }
363                    }
364                }
365            }
366            _ => {
367                body[key] = value.clone();
368            }
369        }
370    }
371}
372
373pub async fn ollama_readiness(options: OllamaReadinessOptions) -> OllamaReadinessResult {
374    let base_url = options.base_url.unwrap_or_else(default_ollama_base_url);
375    let mut result = OllamaReadinessResult::probing(base_url.clone(), options.model.clone());
376
377    let tags_url = match ollama_endpoint_url(&base_url, "/api/tags") {
378        Ok(url) => url,
379        Err(message) => return result.fail("invalid_url", message),
380    };
381    result.tags_url = tags_url.clone();
382
383    let client = crate::llm::shared_utility_client();
384    let response = match client
385        .get(tags_url.clone())
386        .timeout(options.tags_timeout)
387        .send()
388        .await
389    {
390        Ok(response) => response,
391        Err(error) => {
392            return result.fail(
393                "daemon_down",
394                format!("Ollama not reachable at {tags_url}: {error}"),
395            );
396        }
397    };
398
399    let status = response.status();
400    result.http_status = Some(status.as_u16());
401    if !status.is_success() {
402        let body = response.text().await.unwrap_or_default();
403        return result.fail(
404            "bad_status",
405            format!(
406                "Ollama returned HTTP {} from /api/tags: {body}",
407                status.as_u16()
408            ),
409        );
410    }
411
412    let body: Value = match response.json().await {
413        Ok(value) => value,
414        Err(error) => {
415            return result.fail(
416                "invalid_response",
417                format!("Could not parse Ollama model list: {error}"),
418            );
419        }
420    };
421
422    let Some(models) = parse_ollama_model_names(&body) else {
423        return result.fail(
424            "invalid_response",
425            "Could not parse Ollama model list: missing models[].name".to_string(),
426        );
427    };
428    result.available_models = models.clone();
429
430    let Some(matched) = find_ollama_model_match(&models, &options.model) else {
431        let available = if models.is_empty() {
432            "(none)".to_string()
433        } else {
434            models.join(", ")
435        };
436        return result.fail(
437            "model_missing",
438            format!(
439                "Ollama model '{}' not found. Available: {available}",
440                options.model
441            ),
442        );
443    };
444    result.matched_model = Some(matched.clone());
445
446    let settings = OllamaRuntimeSettings::from_env_overrides_and_model(None, Some(&matched));
447    let keep_alive = options
448        .keep_alive
449        .clone()
450        .unwrap_or_else(|| settings.keep_alive.clone());
451    result.expected = Some(OllamaExpectedRequest {
452        num_ctx: settings.num_ctx,
453        keep_alive: keep_alive.clone(),
454    });
455    result.keep_alive = Some(keep_alive.clone());
456
457    result.message = format!("Ollama is reachable and model '{matched}' is available");
458    if options.warm {
459        let warm = ollama_warmup(
460            &base_url,
461            &matched,
462            Some(keep_alive),
463            options.warmup_timeout,
464        )
465        .await;
466        if !warm.valid {
467            result.valid = false;
468            result.status = "warmup_failed".to_string();
469            result.message = warm.message.clone();
470        } else {
471            result.message = format!("{}; {}", result.message, warm.message);
472        }
473        result.warmup = Some(warm);
474    }
475
476    if options.observe_loaded {
477        match fetch_ollama_loaded_runners(&base_url, options.tags_timeout).await {
478            Ok(runners) => {
479                if let Some(runner) = match_loaded_runner(runners, &matched) {
480                    if let Some(actual) = runner.context_length {
481                        if actual != settings.num_ctx {
482                            result.context_drift =
483                                Some(describe_context_drift(settings.num_ctx, actual));
484                        }
485                    }
486                    result.loaded_runner = Some(runner);
487                }
488            }
489            Err(error) => {
490                // /api/ps is best-effort; surface the failure in the
491                // message but don't fail the overall readiness check.
492                result.message = format!("{}; /api/ps probe skipped: {error}", result.message);
493            }
494        }
495    }
496
497    result
498}
499
500impl OllamaReadinessResult {
501    fn probing(base_url: String, model: String) -> Self {
502        Self {
503            valid: true,
504            status: "ok".to_string(),
505            message: String::new(),
506            base_url,
507            tags_url: String::new(),
508            model,
509            matched_model: None,
510            available_models: Vec::new(),
511            http_status: None,
512            keep_alive: None,
513            warmup: None,
514            expected: None,
515            loaded_runner: None,
516            context_drift: None,
517        }
518    }
519
520    fn fail(mut self, status: &str, message: String) -> Self {
521        self.valid = false;
522        self.status = status.to_string();
523        self.message = message;
524        self
525    }
526}
527
528fn default_ollama_base_url() -> String {
529    crate::llm_config::provider_config("ollama")
530        .as_ref()
531        .map(crate::llm_config::resolve_base_url)
532        .unwrap_or_else(|| "http://localhost:11434".to_string())
533}
534
535fn ollama_endpoint_url(base_url: &str, path: &str) -> Result<String, String> {
536    let mut url = reqwest::Url::parse(base_url)
537        .map_err(|error| format!("Invalid Ollama URL '{base_url}': {error}"))?;
538    if url.host_str() == Some("localhost") {
539        url.set_host(Some("127.0.0.1"))
540            .map_err(|_| format!("Invalid Ollama URL '{base_url}': could not normalize host"))?;
541    }
542    let base_path = url.path().trim_end_matches('/');
543    let suffix = path.trim_start_matches('/');
544    let joined = if base_path.is_empty() {
545        format!("/{suffix}")
546    } else {
547        format!("{base_path}/{suffix}")
548    };
549    url.set_path(&joined);
550    url.set_query(None);
551    Ok(url.to_string())
552}
553
554fn parse_ollama_model_names(value: &serde_json::Value) -> Option<Vec<String>> {
555    let models = value.get("models")?.as_array()?;
556    Some(
557        models
558            .iter()
559            .filter_map(|model| model.get("name").and_then(|name| name.as_str()))
560            .map(str::to_string)
561            .collect(),
562    )
563}
564
/// Resolves the caller's `selected` model name against the installed
/// `models`, returning the installed name it maps to.
///
/// Matching is tried in decreasing strictness; within each tier the
/// first installed model wins:
/// 1. exact name,
/// 2. `selected` plus the implicit `:latest` tag,
/// 3. installed name starting with `selected`.
///
/// An empty `selected` never matches. Without this guard the prefix tier
/// would accept it for every name and silently pick the first installed
/// model.
fn find_ollama_model_match(models: &[String], selected: &str) -> Option<String> {
    if selected.is_empty() {
        return None;
    }
    models
        .iter()
        .find(|name| name.as_str() == selected)
        .or_else(|| {
            models
                .iter()
                .find(|name| name.strip_suffix(":latest") == Some(selected))
        })
        .or_else(|| models.iter().find(|name| name.starts_with(selected)))
        .cloned()
}
577
578/// Fetch the list of currently-loaded runners from Ollama's `/api/ps`
579/// endpoint. Returns an empty list when no models are loaded; returns an
580/// error when the daemon is unreachable or the response is malformed.
581pub async fn fetch_ollama_loaded_runners(
582    base_url: &str,
583    timeout: Duration,
584) -> Result<Vec<OllamaLoadedRunner>, String> {
585    let url = ollama_endpoint_url(base_url, "/api/ps")?;
586    let response = crate::llm::shared_utility_client()
587        .get(&url)
588        .timeout(timeout)
589        .send()
590        .await
591        .map_err(|error| format!("Ollama /api/ps not reachable at {url}: {error}"))?;
592    if !response.status().is_success() {
593        return Err(format!(
594            "Ollama returned HTTP {} from /api/ps",
595            response.status().as_u16()
596        ));
597    }
598    let body: Value = response
599        .json()
600        .await
601        .map_err(|error| format!("Could not parse Ollama /api/ps response: {error}"))?;
602    Ok(parse_ollama_loaded_runners(&body))
603}
604
605fn parse_ollama_loaded_runners(value: &Value) -> Vec<OllamaLoadedRunner> {
606    let Some(models) = value.get("models").and_then(Value::as_array) else {
607        return Vec::new();
608    };
609    models
610        .iter()
611        .filter_map(|entry| {
612            let name = entry.get("name").and_then(Value::as_str)?.to_string();
613            let model = entry
614                .get("model")
615                .and_then(Value::as_str)
616                .map(str::to_string)
617                .unwrap_or_else(|| name.clone());
618            Some(OllamaLoadedRunner {
619                name,
620                model,
621                context_length: entry.get("context_length").and_then(Value::as_u64),
622                size_vram: entry.get("size_vram").and_then(Value::as_u64),
623                size: entry.get("size").and_then(Value::as_u64),
624                expires_at: entry
625                    .get("expires_at")
626                    .and_then(Value::as_str)
627                    .map(str::to_string),
628            })
629        })
630        .collect()
631}
632
633fn match_loaded_runner(
634    runners: Vec<OllamaLoadedRunner>,
635    model: &str,
636) -> Option<OllamaLoadedRunner> {
637    runners
638        .into_iter()
639        .find(|runner| runner.name == model || runner.model == model)
640}
641
/// Explains a mismatch between the context Harn would request
/// (`expected`) and the context the loaded runner reports (`actual`),
/// including how to reload the model so the expected value takes effect.
fn describe_context_drift(expected: u64, actual: u64) -> String {
    let runner_is_larger = actual > expected;
    if !runner_is_larger {
        return format!(
            "Loaded runner context_length={actual} is smaller than expected \
             num_ctx={expected}. The runner was loaded at a smaller context — \
             unload with `ollama stop <model>` and let Harn re-warm to expand."
        );
    }
    format!(
        "Loaded runner context_length={actual} exceeds expected num_ctx={expected}. \
         Ollama keeps a runner at the context it was loaded with; run \
         `ollama stop <model>` (or wait for keep_alive to expire) and let Harn \
         re-warm it to apply the smaller context."
    )
}
658
659async fn ollama_warmup(
660    base_url: &str,
661    model: &str,
662    keep_alive: Option<serde_json::Value>,
663    timeout: Duration,
664) -> OllamaWarmupResult {
665    let url = match ollama_endpoint_url(base_url, "/api/generate") {
666        Ok(url) => url,
667        Err(message) => {
668            return OllamaWarmupResult {
669                valid: false,
670                status: "invalid_url".to_string(),
671                message,
672                url: String::new(),
673                model: model.to_string(),
674                http_status: None,
675            };
676        }
677    };
678
679    // Derive the warmup body from runtime settings so num_ctx is loaded
680    // into the runner the same way chat/completion requests would. Without
681    // it, Ollama loads the model at its Modelfile-declared maximum
682    // context, and a subsequent chat request asking for a smaller
683    // num_ctx cannot shrink an already-loaded runner — see the
684    // "Effective vs. loaded context" section of docs/src/llm/providers.md.
685    let settings = OllamaRuntimeSettings::from_env_overrides_and_model(None, Some(model));
686    let mut body = settings.warmup_body(model);
687    if let Some(value) = keep_alive {
688        body["keep_alive"] = value;
689    }
690
691    let client = crate::llm::shared_blocking_client();
692    let response = match client
693        .post(url.clone())
694        .header("Content-Type", "application/json")
695        .timeout(timeout)
696        .json(&body)
697        .send()
698        .await
699    {
700        Ok(response) => response,
701        Err(error) => {
702            return OllamaWarmupResult {
703                valid: false,
704                status: "warmup_failed".to_string(),
705                message: format!("Ollama warmup failed for model '{model}' at {url}: {error}"),
706                url,
707                model: model.to_string(),
708                http_status: None,
709            };
710        }
711    };
712
713    let status = response.status();
714    if !status.is_success() {
715        let body = response.text().await.unwrap_or_default();
716        return OllamaWarmupResult {
717            valid: false,
718            status: "warmup_failed".to_string(),
719            message: format!(
720                "Ollama warmup returned HTTP {} for model '{model}': {body}",
721                status.as_u16()
722            ),
723            url,
724            model: model.to_string(),
725            http_status: Some(status.as_u16()),
726        };
727    }
728
729    let body: serde_json::Value = response.json().await.unwrap_or_default();
730    if let Some(error) = body.get("error").and_then(|error| error.as_str()) {
731        return OllamaWarmupResult {
732            valid: false,
733            status: "warmup_failed".to_string(),
734            message: format!("Ollama warmup failed for model '{model}': {error}"),
735            url,
736            model: model.to_string(),
737            http_status: Some(status.as_u16()),
738        };
739    }
740
741    OllamaWarmupResult {
742        valid: true,
743        status: "ok".to_string(),
744        message: format!("Ollama model '{model}' warmed"),
745        url,
746        model: model.to_string(),
747        http_status: Some(status.as_u16()),
748    }
749}
750
751#[cfg(test)]
752mod tests {
753    use super::*;
754    use crate::http::framing::{http_content_length_from_header_lines, TEST_HTTP_MAX_BODY_BYTES};
755    use crate::llm::env_guard;
756    use std::io::{Read, Write};
757    use std::net::TcpListener;
758    use std::sync::{Arc, Mutex};
759    use std::time::Duration;
760
761    struct ScopedEnvVar {
762        key: &'static str,
763        previous: Option<String>,
764    }
765
766    impl ScopedEnvVar {
767        fn set(key: &'static str, value: &str) -> Self {
768            let previous = std::env::var(key).ok();
769            unsafe {
770                std::env::set_var(key, value);
771            }
772            Self { key, previous }
773        }
774
775        fn remove(key: &'static str) -> Self {
776            let previous = std::env::var(key).ok();
777            unsafe {
778                std::env::remove_var(key);
779            }
780            Self { key, previous }
781        }
782    }
783
784    impl Drop for ScopedEnvVar {
785        fn drop(&mut self) {
786            match &self.previous {
787                Some(value) => unsafe { std::env::set_var(self.key, value) },
788                None => unsafe { std::env::remove_var(self.key) },
789            }
790        }
791    }
792
793    #[test]
794    fn runtime_settings_use_harn_env_before_ollama_env() {
795        let _guard = env_guard();
796        let _env = [
797            ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "131072"),
798            ScopedEnvVar::set("OLLAMA_CONTEXT_LENGTH", "32768"),
799            ScopedEnvVar::set("HARN_OLLAMA_KEEP_ALIVE", "forever"),
800            ScopedEnvVar::set("OLLAMA_KEEP_ALIVE", "5m"),
801        ];
802        let settings = OllamaRuntimeSettings::from_env();
803        assert_eq!(settings.num_ctx, 131072);
804        assert_eq!(settings.keep_alive, serde_json::json!(-1));
805    }
806
807    #[test]
808    fn runtime_settings_apply_harn_defaults() {
809        let _guard = env_guard();
810        let _env = [
811            ScopedEnvVar::remove("HARN_OLLAMA_NUM_CTX"),
812            ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
813            ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
814            ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
815            ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
816        ];
817        let settings = OllamaRuntimeSettings::from_env();
818        assert_eq!(settings.num_ctx, OLLAMA_DEFAULT_NUM_CTX);
819        assert_eq!(settings.keep_alive, serde_json::json!("30m"));
820    }
821
822    #[test]
823    fn runtime_settings_use_catalog_context_after_env_and_overrides() {
824        let _guard = env_guard();
825        let _env = [
826            ScopedEnvVar::remove("HARN_OLLAMA_NUM_CTX"),
827            ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
828            ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
829            ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
830            ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
831        ];
832        crate::llm_config::clear_user_overrides();
833        let mut overlay = crate::llm_config::ProvidersConfig::default();
834        overlay.models.insert(
835            "qwen-test".to_string(),
836            crate::llm_config::ModelDef {
837                name: "Qwen Test".to_string(),
838                provider: "ollama".to_string(),
839                context_window: 100_000,
840                logical_model: None,
841                equivalence_group: None,
842                served_variant: None,
843                wire_model: None,
844                api_dialect: None,
845                rate_limits: None,
846                performance: None,
847                architecture: None,
848                local_memory: None,
849                runtime_context_window: None,
850                stream_timeout: None,
851                capabilities: vec![],
852                pricing: None,
853                deprecated: false,
854                deprecation_note: None,
855                superseded_by: None,
856                fast_mode: None,
857                quality_tags: Vec::new(),
858                availability: crate::llm_config::ModelAvailability::default(),
859                tier: None,
860                open_weight: None,
861                strengths: Vec::new(),
862                benchmarks: std::collections::BTreeMap::new(),
863                family: None,
864                lineage: None,
865                complementary_with: Vec::new(),
866                avoid_as_reviewer_for: Vec::new(),
867            },
868        );
869        crate::llm_config::set_user_overrides(Some(overlay));
870
871        let settings = OllamaRuntimeSettings::from_env_overrides_and_model(None, Some("qwen-test"));
872        assert_eq!(settings.num_ctx, 100_000);
873
874        let env = ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "65536");
875        let settings = OllamaRuntimeSettings::from_env_overrides_and_model(None, Some("qwen-test"));
876        assert_eq!(settings.num_ctx, 65_536);
877        drop(env);
878
879        let overrides = serde_json::json!({"num_ctx": 8192});
880        let settings = OllamaRuntimeSettings::from_env_overrides_and_model(
881            Some(&overrides),
882            Some("qwen-test"),
883        );
884        assert_eq!(settings.num_ctx, 8_192);
885
886        crate::llm_config::clear_user_overrides();
887    }
888
889    #[test]
890    fn provider_overrides_beat_env_and_normalize_keep_alive() {
891        let _guard = env_guard();
892        let _env = [
893            ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "131072"),
894            ScopedEnvVar::set("HARN_OLLAMA_KEEP_ALIVE", "5m"),
895        ];
896        let overrides = serde_json::json!({
897            "num_ctx": "65536",
898            "keep_alive": "infinite",
899        });
900        let settings = OllamaRuntimeSettings::from_env_and_overrides(Some(&overrides));
901        assert_eq!(settings.num_ctx, 65536);
902        assert_eq!(settings.keep_alive, serde_json::json!(-1));
903    }
904
905    #[test]
906    fn apply_runtime_settings_maps_ollama_overrides_to_native_shape() {
907        let _guard = env_guard();
908        let _env = [
909            ScopedEnvVar::remove("HARN_OLLAMA_NUM_CTX"),
910            ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
911            ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
912            ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
913            ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
914        ];
915        let mut body = serde_json::json!({
916            "model": "qwen",
917            "options": {"temperature": 0.1}
918        });
919        let overrides = serde_json::json!({
920            "num_ctx": 65536,
921            "keep_alive": "default",
922            "options": {"top_k": 20, "num_ctx": 999},
923            "think": true,
924        });
925        apply_ollama_runtime_settings(&mut body, Some(&overrides));
926        assert_eq!(body["options"]["num_ctx"], serde_json::json!(65536));
927        assert_eq!(body["options"]["top_k"], serde_json::json!(20));
928        assert_eq!(body["options"]["temperature"], serde_json::json!(0.1));
929        assert_eq!(body["keep_alive"], serde_json::json!("30m"));
930        assert_eq!(body["think"], serde_json::json!(true));
931        assert!(body.get("num_ctx").is_none());
932    }
933
934    #[test]
935    fn apply_runtime_settings_uses_catalog_context_when_body_has_model() {
936        let _guard = env_guard();
937        let _env = [
938            ScopedEnvVar::remove("HARN_OLLAMA_NUM_CTX"),
939            ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
940            ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
941            ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
942            ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
943        ];
944        crate::llm_config::clear_user_overrides();
945        let mut overlay = crate::llm_config::ProvidersConfig::default();
946        overlay.models.insert(
947            "qwen-test".to_string(),
948            crate::llm_config::ModelDef {
949                name: "Qwen Test".to_string(),
950                provider: "ollama".to_string(),
951                context_window: 100_000,
952                logical_model: None,
953                equivalence_group: None,
954                served_variant: None,
955                wire_model: None,
956                api_dialect: None,
957                rate_limits: None,
958                performance: None,
959                architecture: None,
960                local_memory: None,
961                runtime_context_window: Some(32_768),
962                stream_timeout: None,
963                capabilities: vec![],
964                pricing: None,
965                deprecated: false,
966                deprecation_note: None,
967                superseded_by: None,
968                fast_mode: None,
969                quality_tags: Vec::new(),
970                availability: crate::llm_config::ModelAvailability::default(),
971                tier: None,
972                open_weight: None,
973                strengths: Vec::new(),
974                benchmarks: std::collections::BTreeMap::new(),
975                family: None,
976                lineage: None,
977                complementary_with: Vec::new(),
978                avoid_as_reviewer_for: Vec::new(),
979            },
980        );
981        crate::llm_config::set_user_overrides(Some(overlay));
982
983        let mut body = serde_json::json!({
984            "model": "qwen-test",
985            "options": {"temperature": 0.1}
986        });
987        apply_ollama_runtime_settings(&mut body, None);
988        assert_eq!(body["options"]["num_ctx"], serde_json::json!(32768));
989        assert_eq!(body["options"]["temperature"], serde_json::json!(0.1));
990
991        crate::llm_config::clear_user_overrides();
992    }
993
994    #[test]
995    fn ollama_keep_alive_normalization_handles_default_and_numbers() {
996        assert_eq!(
997            normalize_ollama_keep_alive("default"),
998            Some(serde_json::json!("30m"))
999        );
1000        assert_eq!(
1001            normalize_ollama_keep_alive("forever"),
1002            Some(serde_json::json!(-1))
1003        );
1004        assert_eq!(
1005            normalize_ollama_keep_alive("120"),
1006            Some(serde_json::json!(120))
1007        );
1008        assert_eq!(
1009            normalize_ollama_keep_alive("10m"),
1010            Some(serde_json::json!("10m"))
1011        );
1012        assert_eq!(normalize_ollama_keep_alive("   "), None);
1013    }
1014
1015    fn readiness_options(model: &str, base_url: String) -> OllamaReadinessOptions {
1016        OllamaReadinessOptions {
1017            model: model.to_string(),
1018            base_url: Some(base_url),
1019            warm: false,
1020            keep_alive: None,
1021            tags_timeout: Duration::from_secs(2),
1022            warmup_timeout: Duration::from_secs(2),
1023            observe_loaded: false,
1024        }
1025    }
1026
1027    #[test]
1028    fn ollama_readiness_verifies_model_and_warms_matched_tag() {
1029        let _guard = env_guard();
1030        let _env = [
1031            ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "65536"),
1032            ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
1033            ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
1034            ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
1035            ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
1036        ];
1037        let captured = Arc::new(Mutex::new(Vec::new()));
1038        let (addr, server) = spawn_stub(
1039            vec![
1040                (
1041                    200,
1042                    r#"{"models":[{"name":"qwen3:latest"},{"name":"llama3.2:latest"}]}"#,
1043                ),
1044                (200, r#"{"response":"","done":true}"#),
1045            ],
1046            captured.clone(),
1047        );
1048
1049        let result = tokio::runtime::Runtime::new()
1050            .expect("runtime")
1051            .block_on(ollama_readiness(OllamaReadinessOptions {
1052                warm: true,
1053                keep_alive: Some(serde_json::json!(-1)),
1054                ..readiness_options("qwen3", format!("http://{addr}"))
1055            }));
1056
1057        server.join().expect("stub server");
1058        assert!(result.valid, "result was: {result:?}");
1059        assert_eq!(result.status, "ok");
1060        assert_eq!(result.matched_model.as_deref(), Some("qwen3:latest"));
1061        assert!(result.warmup.as_ref().is_some_and(|warm| warm.valid));
1062        let expected = result.expected.as_ref().expect("expected request");
1063        assert_eq!(expected.num_ctx, 65_536);
1064        assert_eq!(expected.keep_alive, serde_json::json!(-1));
1065
1066        let requests = captured.lock().expect("captured requests");
1067        assert!(requests[0].starts_with("GET /api/tags "));
1068        assert!(requests[1].starts_with("POST /api/generate "));
1069        let body = requests[1].split("\r\n\r\n").nth(1).unwrap_or("");
1070        let json: serde_json::Value = serde_json::from_str(body).expect("warmup body");
1071        assert_eq!(json["model"], "qwen3:latest");
1072        assert_eq!(json["prompt"], "");
1073        assert_eq!(json["stream"], false);
1074        assert_eq!(json["keep_alive"], -1);
1075        assert_eq!(
1076            json["options"]["num_ctx"], 65_536,
1077            "warmup must inject num_ctx so Ollama loads the runner at the requested context — see issue #1600"
1078        );
1079    }
1080
1081    #[test]
1082    fn ollama_readiness_reports_missing_model_with_available_tags() {
1083        let captured = Arc::new(Mutex::new(Vec::new()));
1084        let (addr, server) = spawn_stub(
1085            vec![(200, r#"{"models":[{"name":"llama3.2:latest"}]}"#)],
1086            captured,
1087        );
1088
1089        let result = tokio::runtime::Runtime::new()
1090            .expect("runtime")
1091            .block_on(ollama_readiness(readiness_options(
1092                "qwen3",
1093                format!("http://{addr}"),
1094            )));
1095
1096        server.join().expect("stub server");
1097        assert!(!result.valid);
1098        assert_eq!(result.status, "model_missing");
1099        assert_eq!(result.available_models, vec!["llama3.2:latest"]);
1100        assert!(result.message.contains("qwen3"));
1101    }
1102
1103    #[test]
1104    fn ollama_readiness_observes_loaded_runner_and_reports_no_drift() {
1105        let _guard = env_guard();
1106        let _env = [
1107            ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "32768"),
1108            ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
1109            ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
1110            ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
1111            ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
1112        ];
1113        let captured = Arc::new(Mutex::new(Vec::new()));
1114        let (addr, server) = spawn_stub(
1115            vec![
1116                (200, r#"{"models":[{"name":"qwen3:latest"}]}"#),
1117                (
1118                    200,
1119                    r#"{"models":[{"name":"qwen3:latest","model":"qwen3:latest","context_length":32768,"size_vram":1234,"size":4321,"expires_at":"2026-05-13T12:00:00Z"}]}"#,
1120                ),
1121            ],
1122            captured.clone(),
1123        );
1124
1125        let result = tokio::runtime::Runtime::new()
1126            .expect("runtime")
1127            .block_on(ollama_readiness(OllamaReadinessOptions {
1128                observe_loaded: true,
1129                ..readiness_options("qwen3", format!("http://{addr}"))
1130            }));
1131
1132        server.join().expect("stub server");
1133        assert!(result.valid, "result was: {result:?}");
1134        let runner = result.loaded_runner.expect("loaded runner present");
1135        assert_eq!(runner.context_length, Some(32_768));
1136        assert_eq!(runner.size_vram, Some(1234));
1137        assert!(
1138            result.context_drift.is_none(),
1139            "no drift expected; got {:?}",
1140            result.context_drift
1141        );
1142
1143        let requests = captured.lock().expect("captured requests");
1144        assert!(requests[0].starts_with("GET /api/tags "));
1145        assert!(requests[1].starts_with("GET /api/ps "));
1146    }
1147
1148    #[test]
1149    fn ollama_readiness_flags_context_drift_when_loaded_exceeds_expected() {
1150        let _guard = env_guard();
1151        let _env = [
1152            ScopedEnvVar::set("HARN_OLLAMA_NUM_CTX", "32768"),
1153            ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
1154            ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
1155            ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
1156            ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
1157        ];
1158        let (addr, server) = spawn_stub(
1159            vec![
1160                (200, r#"{"models":[{"name":"devstral-small-2:24b"}]}"#),
1161                (
1162                    200,
1163                    r#"{"models":[{"name":"devstral-small-2:24b","model":"devstral-small-2:24b","context_length":262144}]}"#,
1164                ),
1165            ],
1166            Arc::new(Mutex::new(Vec::new())),
1167        );
1168
1169        let result = tokio::runtime::Runtime::new()
1170            .expect("runtime")
1171            .block_on(ollama_readiness(OllamaReadinessOptions {
1172                observe_loaded: true,
1173                ..readiness_options("devstral-small-2:24b", format!("http://{addr}"))
1174            }));
1175
1176        server.join().expect("stub server");
1177        assert!(result.valid, "result was: {result:?}");
1178        let drift = result.context_drift.expect("drift expected");
1179        assert!(drift.contains("262144"), "drift message: {drift}");
1180        assert!(drift.contains("32768"), "drift message: {drift}");
1181        assert!(
1182            drift.contains("ollama stop"),
1183            "drift message must teach the user how to recover: {drift}"
1184        );
1185    }
1186
1187    #[test]
1188    fn ollama_readiness_uses_alias_resolved_runtime_settings() {
1189        let _guard = env_guard();
1190        let _env = [
1191            ScopedEnvVar::remove("HARN_OLLAMA_NUM_CTX"),
1192            ScopedEnvVar::remove("OLLAMA_CONTEXT_LENGTH"),
1193            ScopedEnvVar::remove("OLLAMA_NUM_CTX"),
1194            ScopedEnvVar::remove("HARN_OLLAMA_KEEP_ALIVE"),
1195            ScopedEnvVar::remove("OLLAMA_KEEP_ALIVE"),
1196        ];
1197        crate::llm_config::clear_user_overrides();
1198        let mut overlay = crate::llm_config::ProvidersConfig::default();
1199        overlay.aliases.insert(
1200            "devstral-small-2".to_string(),
1201            crate::llm_config::AliasDef {
1202                id: "devstral-small-2:24b".to_string(),
1203                provider: "ollama".to_string(),
1204                tool_format: None,
1205            },
1206        );
1207        overlay.models.insert(
1208            "devstral-small-2:24b".to_string(),
1209            crate::llm_config::ModelDef {
1210                name: "Devstral Small 2 24B".to_string(),
1211                provider: "ollama".to_string(),
1212                context_window: 262_144,
1213                logical_model: None,
1214                equivalence_group: None,
1215                served_variant: None,
1216                wire_model: None,
1217                api_dialect: None,
1218                rate_limits: None,
1219                performance: None,
1220                architecture: None,
1221                local_memory: None,
1222                runtime_context_window: Some(98_304),
1223                stream_timeout: None,
1224                capabilities: vec![],
1225                pricing: None,
1226                deprecated: false,
1227                deprecation_note: None,
1228                superseded_by: None,
1229                fast_mode: None,
1230                quality_tags: Vec::new(),
1231                availability: crate::llm_config::ModelAvailability::default(),
1232                tier: None,
1233                open_weight: None,
1234                strengths: Vec::new(),
1235                benchmarks: std::collections::BTreeMap::new(),
1236                family: None,
1237                lineage: None,
1238                complementary_with: Vec::new(),
1239                avoid_as_reviewer_for: Vec::new(),
1240            },
1241        );
1242        crate::llm_config::set_user_overrides(Some(overlay));
1243
1244        let (resolved, _) = crate::llm_config::resolve_model("devstral-small-2");
1245        assert_eq!(resolved, "devstral-small-2:24b");
1246
1247        let captured = Arc::new(Mutex::new(Vec::new()));
1248        let (addr, server) = spawn_stub(
1249            vec![
1250                (200, r#"{"models":[{"name":"devstral-small-2:24b"}]}"#),
1251                (200, r#"{"response":"","done":true}"#),
1252            ],
1253            captured.clone(),
1254        );
1255
1256        let result = tokio::runtime::Runtime::new()
1257            .expect("runtime")
1258            .block_on(ollama_readiness(OllamaReadinessOptions {
1259                warm: true,
1260                ..readiness_options(&resolved, format!("http://{addr}"))
1261            }));
1262
1263        server.join().expect("stub server");
1264        crate::llm_config::clear_user_overrides();
1265
1266        assert!(result.valid, "result was: {result:?}");
1267        let expected = result.expected.expect("expected request populated");
1268        assert_eq!(
1269            expected.num_ctx, 98_304,
1270            "alias-resolved model must pull runtime_context_window from the catalog"
1271        );
1272
1273        let requests = captured.lock().expect("captured requests");
1274        let warmup_body = requests[1].split("\r\n\r\n").nth(1).unwrap_or("");
1275        let json: serde_json::Value = serde_json::from_str(warmup_body).expect("warmup body");
1276        assert_eq!(json["options"]["num_ctx"], 98_304);
1277    }
1278
1279    #[test]
1280    fn fetch_ollama_loaded_runners_parses_optional_fields() {
1281        let captured = Arc::new(Mutex::new(Vec::new()));
1282        let (addr, server) = spawn_stub(
1283            vec![(
1284                200,
1285                r#"{"models":[{"name":"a:latest","model":"a:latest"},{"name":"b:latest","model":"b:latest","context_length":8192,"size_vram":42,"expires_at":"now"}]}"#,
1286            )],
1287            captured,
1288        );
1289
1290        let runners = tokio::runtime::Runtime::new()
1291            .expect("runtime")
1292            .block_on(fetch_ollama_loaded_runners(
1293                &format!("http://{addr}"),
1294                Duration::from_secs(2),
1295            ))
1296            .expect("ps response parses");
1297        server.join().expect("stub server");
1298
1299        assert_eq!(runners.len(), 2);
1300        assert_eq!(runners[0].name, "a:latest");
1301        assert!(runners[0].context_length.is_none());
1302        assert_eq!(runners[1].context_length, Some(8192));
1303        assert_eq!(runners[1].size_vram, Some(42));
1304        assert_eq!(runners[1].expires_at.as_deref(), Some("now"));
1305    }
1306
1307    fn spawn_stub(
1308        responses: Vec<(u16, &'static str)>,
1309        captured: Arc<Mutex<Vec<String>>>,
1310    ) -> (std::net::SocketAddr, std::thread::JoinHandle<()>) {
1311        let listener = TcpListener::bind("127.0.0.1:0").expect("bind ollama stub");
1312        let addr = listener.local_addr().expect("stub addr");
1313        let handle = std::thread::spawn(move || {
1314            for (status, body) in responses {
1315                let (mut stream, _) = listener.accept().expect("accept request");
1316                stream
1317                    .set_read_timeout(Some(Duration::from_secs(2)))
1318                    .expect("read timeout");
1319                let request = read_http_request(&mut stream);
1320                captured.lock().expect("captured").push(request);
1321                let reason = if status == 200 { "OK" } else { "ERROR" };
1322                let response = format!(
1323                    "HTTP/1.1 {status} {reason}\r\ncontent-type: application/json\r\ncontent-length: {}\r\nconnection: close\r\n\r\n{body}",
1324                    body.len()
1325                );
1326                stream
1327                    .write_all(response.as_bytes())
1328                    .expect("write response");
1329            }
1330        });
1331        (addr, handle)
1332    }
1333
1334    fn read_http_request(stream: &mut std::net::TcpStream) -> String {
1335        let mut data = Vec::new();
1336        let mut buf = [0_u8; 512];
1337        loop {
1338            let n = stream.read(&mut buf).expect("read request");
1339            if n == 0 {
1340                break;
1341            }
1342            data.extend_from_slice(&buf[..n]);
1343            let text = String::from_utf8_lossy(&data);
1344            if let Some(header_end) = text.find("\r\n\r\n") {
1345                let headers = &text[..header_end];
1346                let content_length = match http_content_length_from_header_lines(
1347                    headers.lines(),
1348                    TEST_HTTP_MAX_BODY_BYTES,
1349                ) {
1350                    Ok(content_length) => content_length,
1351                    Err(_) => break,
1352                };
1353                let Some(body_end) = header_end
1354                    .checked_add(4)
1355                    .and_then(|body_start| body_start.checked_add(content_length))
1356                else {
1357                    break;
1358                };
1359                if data.len() >= body_end {
1360                    break;
1361                }
1362            }
1363        }
1364        String::from_utf8(data).expect("utf8 request")
1365    }
1366}