llmserve 0.0.7

TUI for serving local LLM models. Pick a model, pick a backend, serve it.
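A rough sketch of driving the same flow programmatically, using the API exercised by the integration tests below (signatures are inferred from those tests; readiness polling and error handling are omitted):

    use llmserve::{backends::detect_backends, config::Config, models::discover_models, server};

    fn main() {
        // Pick any discovered model and the first available backend.
        let model = discover_models(&[]).into_iter().next().expect("no local models found");
        let detected = detect_backends().into_iter().find(|d| d.available).expect("no backend available");

        // Launch with default settings, then stop (the tests poll a health URL in between).
        let mut handle = server::launch(&model, &detected.backend, &Config::default()).expect("launch failed");
        server::stop(&mut handle);
    }

The crate's integration tests, which exercise this flow end to end, follow.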
//! Integration tests for llmserve.
//!
//! Tests marked `#[ignore]` require local models and backends — they are
//! skipped in CI and can be run locally with:
//!     cargo test -- --include-ignored
//!
//! The remaining tests use synthetic data and run everywhere.
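//!
//! To run a single ignored test and see its progress output:
//!     cargo test serve_and_rotate_backends -- --ignored --nocapture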

use std::path::PathBuf;
use std::time::{Duration, Instant};

use llmserve::backends::{Backend, backend_key, detect_backends};
use llmserve::config::{BackendPreset, Config};
use llmserve::models::{DiscoveredModel, ModelFormat, ModelSource, discover_models};
use llmserve::server;

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Pick the smallest GGUF model on disk so the test loads fast.
fn find_smallest_gguf() -> Option<DiscoveredModel> {
    let models = discover_models(&[]);
    models
        .into_iter()
        .filter(|m| m.format == ModelFormat::Gguf && m.source != ModelSource::Ollama)
        .min_by_key(|m| m.size_bytes)
}

/// Pick the smallest MLX model on disk.
fn find_smallest_mlx() -> Option<DiscoveredModel> {
    let models = discover_models(&[]);
    models
        .into_iter()
        .filter(|m| m.format == ModelFormat::Mlx)
        .min_by_key(|m| m.size_bytes)
}

/// Build a test config that uses a specific port and minimal resources.
fn test_config(port: u16) -> Config {
    let mut config = Config::default();
    config.preferred_port = port;
    config.preferred_host = "127.0.0.1".into();
    config.default_ctx_size = 512;

    for preset in config.presets.values_mut() {
        preset.ctx_size = Some(512);
        preset.port = Some(port);
        preset.host = Some("127.0.0.1".into());
    }

    if let Some(preset) = config.presets.get_mut("llama-server") {
        preset.batch_size = Some(256);
        preset.flash_attn = Some(true);
    }

    config
}
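
// For reference, the preset configured above corresponds roughly to this TOML in the
// user's config file (table name and field spellings are assumptions based on the
// fields these tests touch, not a verified schema):
//
//   [presets.llama-server]
//   ctx_size = 512
//   port = 8080          # whichever port free_port() handed out
//   host = "127.0.0.1"
//   batch_size = 256
//   flash_attn = true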

/// Find a free port by binding to :0 and reading back what the OS gave us.
fn free_port() -> u16 {
    let listener = std::net::TcpListener::bind("127.0.0.1:0").expect("bind to ephemeral port");
    listener.local_addr().unwrap().port()
}

/// Poll a URL until it returns 200 or the timeout expires.
fn wait_for_ready(url: &str, timeout: Duration) -> Result<(), String> {
    let agent = ureq::Agent::config_builder()
        .timeout_global(Some(Duration::from_millis(500)))
        .build()
        .new_agent();

    let start = Instant::now();
    while start.elapsed() < timeout {
        if let Ok(resp) = agent.get(url).call() {
            if resp.status().as_u16() == 200 {
                return Ok(());
            }
        }
        std::thread::sleep(Duration::from_millis(500));
    }
    Err(format!("Timed out waiting for {url} after {timeout:?}"))
}

/// Verify we can hit the OpenAI-compatible models endpoint.
fn check_openai_models(port: u16) -> Result<(), String> {
    let agent = ureq::Agent::config_builder()
        .timeout_global(Some(Duration::from_secs(5)))
        .build()
        .new_agent();

    let url = format!("http://127.0.0.1:{port}/v1/models");
    match agent.get(&url).call() {
        Ok(resp) if resp.status().as_u16() == 200 => Ok(()),
        Ok(resp) => Err(format!("/v1/models returned {}", resp.status())),
        Err(e) => Err(format!("/v1/models failed: {e}")),
    }
}

/// Verify we can send a trivial completion request to the server.
fn check_completion(port: u16) -> Result<(), String> {
    let agent = ureq::Agent::config_builder()
        .timeout_global(Some(Duration::from_secs(30)))
        .build()
        .new_agent();

    let url = format!("http://127.0.0.1:{port}/v1/chat/completions");
    let body = serde_json::json!({
        "model": "test",
        "messages": [{"role": "user", "content": "Say hi"}],
        "max_tokens": 4
    });

    match agent.post(&url).send_json(&body) {
        Ok(resp) if resp.status().as_u16() == 200 => Ok(()),
        Ok(resp) => Err(format!("/v1/chat/completions returned {}", resp.status())),
        Err(e) => Err(format!("/v1/chat/completions failed: {e}")),
    }
}
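
// Not exercised by these tests: a minimal sketch of pulling the assistant reply out
// of a successful /v1/chat/completions response, assuming the standard
// OpenAI-compatible shape (`choices[0].message.content`).
#[allow(dead_code)]
fn extract_reply(body: &str) -> Option<String> {
    let v: serde_json::Value = serde_json::from_str(body).ok()?;
    v["choices"][0]["message"]["content"]
        .as_str()
        .map(|s| s.to_string())
}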

// ---------------------------------------------------------------------------
// Tests that require local models (skipped in CI)
// ---------------------------------------------------------------------------

/// Core rotation test: iterate through every detected backend, serve, verify,
/// stop, and move on.
#[test]
#[ignore] // Requires local models and inference backends
fn serve_and_rotate_backends() {
    let backends = detect_backends();
    let gguf_model = find_smallest_gguf();
    let mlx_model = find_smallest_mlx();

    if gguf_model.is_none() && mlx_model.is_none() {
        eprintln!("SKIP: no local models found for integration test");
        return;
    }

    let mut served_count = 0;

    for detected in &backends {
        if !detected.available {
            eprintln!("SKIP backend {}: not available", detected.backend.label());
            continue;
        }

        if detected.backend == Backend::LmStudio {
            eprintln!("SKIP backend LM Studio: externally managed");
            continue;
        }

        let model = match detected.backend {
            Backend::MlxLm => {
                if let Some(ref m) = mlx_model {
                    m.clone()
                } else {
                    eprintln!("SKIP backend MLX: no MLX models found");
                    continue;
                }
            }
            _ => {
                if let Some(ref m) = gguf_model {
                    m.clone()
                } else {
                    eprintln!(
                        "SKIP backend {}: no GGUF models found",
                        detected.backend.label()
                    );
                    continue;
                }
            }
        };

        if detected.backend == Backend::Ollama {
            eprintln!("Testing Ollama API connectivity (daemon already running)...");
            let ollama_url =
                std::env::var("OLLAMA_HOST").unwrap_or_else(|_| "http://localhost:11434".into());
            let agent = ureq::Agent::config_builder()
                .timeout_global(Some(Duration::from_secs(2)))
                .build()
                .new_agent();
            match agent.get(&format!("{ollama_url}/api/tags")).call() {
                Ok(resp) if resp.status().as_u16() == 200 => {
                    eprintln!("  OK: Ollama API reachable");
                    served_count += 1;
                }
                _ => eprintln!("  WARN: Ollama API unreachable despite detection"),
            }
            continue;
        }

        let port = free_port();
        let config = test_config(port);

        eprintln!(
            "Serving {} via {} on port {port}...",
            model.name,
            detected.backend.label()
        );

        let result = server::launch(&model, &detected.backend, &config);
        let mut handle = match result {
            Ok(h) => h,
            Err(e) => {
                eprintln!("  SKIP: launch failed: {e}");
                continue;
            }
        };

        let health_url = match detected.backend {
            Backend::LlamaServer => format!("http://127.0.0.1:{port}/health"),
            Backend::MlxLm => format!("http://127.0.0.1:{port}/v1/models"),
            _ => format!("http://127.0.0.1:{port}/health"),
        };

        let ready = wait_for_ready(&health_url, Duration::from_secs(120));

        if let Err(ref reason) = ready {
            eprintln!("  Server did not become ready: {reason}");
            server::stop(&mut handle);
            if let Some(exit_msg) = server::check_exited(&mut handle) {
                eprintln!("  Server exited: {exit_msg}");
            }
            continue;
        }

        eprintln!("  Server ready on port {port}");

        match check_openai_models(port) {
            Ok(()) => eprintln!("  /v1/models: OK"),
            Err(e) => eprintln!("  /v1/models: {e}"),
        }

        match check_completion(port) {
            Ok(()) => eprintln!("  /v1/chat/completions: OK"),
            Err(e) => eprintln!("  /v1/chat/completions: {e}"),
        }

        server::stop(&mut handle);
        eprintln!("  Stopped.");

        std::thread::sleep(Duration::from_millis(500));

        served_count += 1;
    }

    assert!(
        served_count > 0,
        "Expected at least one backend to be tested, but none were available"
    );
    eprintln!("\nServed and verified {served_count} backend(s).");
}

/// Test that model discovery finds at least one model and returns sane metadata.
#[test]
#[ignore] // Requires local model files on disk
fn discover_local_models() {
    let models = discover_models(&[]);
    assert!(
        !models.is_empty(),
        "Expected to discover at least one model"
    );

    for m in &models {
        assert!(!m.name.is_empty(), "Model name should not be empty");
        if m.source != ModelSource::Ollama {
            assert!(
                m.path.exists(),
                "Model path should exist: {}",
                m.path.display()
            );
        }
    }
}

/// Test that extra directories are handled: a real one never loses models, a bogus one changes nothing.
#[test]
#[ignore] // Requires ~/.lmstudio/models on disk
fn discover_models_from_extra_dir() {
    let home = dirs::home_dir().unwrap();
    let lmstudio_dir = home.join(".lmstudio").join("models");
    if !lmstudio_dir.is_dir() {
        eprintln!("SKIP: ~/.lmstudio/models not found");
        return;
    }

    let baseline = discover_models(&[]);
    let baseline_count = baseline.len();

    let with_extra = discover_models(&[lmstudio_dir]);
    assert!(
        with_extra.len() >= baseline_count,
        "An existing extra dir should never reduce the model count"
    );

    let with_bogus = discover_models(&[PathBuf::from("/tmp/nonexistent_llmserve_test_dir")]);
    assert_eq!(
        with_bogus.len(),
        baseline_count,
        "Nonexistent extra dir should not change model count"
    );
}

/// Test vision-model mmproj detection: every model that reports an mmproj must point to a real, correctly named file.
#[test]
#[ignore] // Requires local models with mmproj files
fn vision_models_detected() {
    let models = discover_models(&[]);
    let vision_models: Vec<_> = models.iter().filter(|m| m.mmproj.is_some()).collect();

    if vision_models.is_empty() {
        eprintln!("SKIP: no models with an mmproj file found");
        return;
    }

    for m in &vision_models {
        let mmproj = m.mmproj.as_ref().unwrap();
        assert!(
            mmproj.exists(),
            "mmproj path should exist: {}",
            mmproj.display()
        );
        assert!(
            mmproj
                .file_name()
                .unwrap()
                .to_string_lossy()
                .starts_with("mmproj"),
            "mmproj filename should start with 'mmproj'"
        );
    }
}

// ---------------------------------------------------------------------------
// Tests that run everywhere (no local models needed)
// ---------------------------------------------------------------------------

/// Test that backend detection returns consistent results across runs.
#[test]
fn backend_detection_is_consistent() {
    let first = detect_backends();
    let second = detect_backends();

    assert_eq!(first.len(), second.len());
    for (a, b) in first.iter().zip(second.iter()) {
        assert_eq!(a.backend, b.backend);
        assert_eq!(
            a.available,
            b.available,
            "Backend {} availability changed between runs",
            a.backend.label()
        );
    }
}

/// Test that config presets resolve correctly for each backend.
#[test]
fn config_presets_resolve_per_backend() {
    let port = free_port();
    let config = test_config(port);

    for detected in detect_backends() {
        let key = backend_key(&detected.backend);
        let preset = config.preset_for(key);

        assert_eq!(preset.port, port, "Preset port for {key}");
        assert_eq!(preset.host, "127.0.0.1", "Preset host for {key}");
        assert_eq!(preset.ctx_size, 512, "Preset ctx_size for {key}");
    }
}

/// Test that extra_args from presets are preserved through config roundtrip.
#[test]
fn preset_extra_args_roundtrip() {
    let mut config = Config::default();
    config.presets.insert(
        "llama-server".into(),
        BackendPreset {
            extra_args: vec!["--mlock".into(), "--cont-batching".into()],
            ..Default::default()
        },
    );

    let serialized = toml::to_string_pretty(&config).unwrap();
    let deserialized: Config = toml::from_str(&serialized).unwrap();

    let preset = deserialized.preset_for("llama-server");
    assert_eq!(preset.extra_args, vec!["--mlock", "--cont-batching"]);
}

/// Test that serving on LM Studio backend returns an informative error.
#[test]
fn lmstudio_serve_returns_error() {
    let model = DiscoveredModel {
        name: "test-model".into(),
        path: PathBuf::from("/tmp/fake.gguf"),
        mmproj: None,
        format: ModelFormat::Gguf,
        size_bytes: 0,
        quant: None,
        param_hint: None,
        source: ModelSource::ExtraDir,
    };
    let config = Config::default();
    let result = server::launch(&model, &Backend::LmStudio, &config);
    assert!(result.is_err());
    let err = result.err().unwrap();
    assert!(
        err.contains("LM Studio"),
        "Error should mention LM Studio: {err}"
    );
}

/// Test that model discovery doesn't crash on nonexistent directories.
#[test]
fn discover_models_nonexistent_dir() {
    let models = discover_models(&[PathBuf::from("/tmp/nonexistent_llmserve_test_dir_12345")]);
    // Should not panic — may return 0 or more depending on default scan locations
    let _ = models.len();
}