// llama_server — lib.rs
1//! Download, embed, and run llama.cpp in your Rust projects.
2pub mod backend;
3
4mod artifact;
5mod build;
6mod cache;
7mod error;
8mod http;
9
10pub use artifact::Artifact;
11pub use backend::Backend;
12pub use build::Build;
13pub use error::Error;
14pub use http::Progress;
15
16use crate::cache::Cache;
17
18use sipper::{Sipper, Straw, sipper};
19use tokio::process;
20use tokio::time::{self, Duration};
21
22use std::io;
23use std::path::{Path, PathBuf};
24use std::process::Stdio;
25
/// A server instance.
///
/// An installed `llama-server` build on disk, ready to be booted with a
/// model via [`Server::boot`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Server {
    /// The specific [`Build`] of the [`Server`].
    pub build: Build,
    /// The available backends of the [`Server`].
    pub backends: backend::Set,
    /// The path to the executable binary of the [`Server`].
    pub executable: PathBuf,
}
36
37impl Server {
38    /// Lists all the [`Server`] builds installed in the system.
39    pub async fn list() -> Result<Vec<Build>, Error> {
40        let mut builds: Vec<_> = Cache::list().await?.iter().map(Cache::build).collect();
41
42        builds.sort();
43
44        Ok(builds)
45    }
46
47    /// Download and installs the given [`Build`] of a [`Server`] with the given backends.
48    pub fn download(build: Build, backends: backend::Set) -> impl Straw<Self, Download, Error> {
49        sipper(async move |sender| {
50            let cache = Cache::new(build);
51
52            let artifacts = [Artifact::Server]
53                .into_iter()
54                .chain(backends.available().map(Artifact::Backend));
55
56            let mut components = Vec::new();
57
58            for artifact in artifacts {
59                let component = cache
60                    .download(artifact)
61                    .with(|progress| Download { artifact, progress })
62                    .run(sender.clone())
63                    .await?;
64
65                components.push(component);
66            }
67
68            let executable = cache.link(components).await?;
69
70            Ok(Self {
71                build,
72                backends: backends.normalize(),
73                executable,
74            })
75        })
76    }
77
78    /// Boots an [`Instance`] of the [`Server`] using the given model.
79    pub async fn boot(
80        &self,
81        model: impl AsRef<Path>,
82        settings: Settings,
83    ) -> Result<Instance, Error> {
84        let process = process::Command::new(&self.executable)
85            .args(
86                format!(
87                    "--model {model} --host {host} --port {port} --gpu-layers {gpu_layers} --jinja",
88                    model = model.as_ref().display(),
89                    host = settings.host,
90                    port = settings.port,
91                    gpu_layers = settings.gpu_layers,
92                )
93                .split_whitespace(),
94            )
95            .stdin(settings.stdin)
96            .stdout(settings.stdout)
97            .stderr(settings.stderr)
98            .kill_on_drop(true)
99            .spawn()?;
100
101        Ok(Instance {
102            host: settings.host,
103            port: settings.port,
104            process,
105        })
106    }
107
108    /// Deletes the [`Server`] installation with the given [`Build`].
109    pub async fn delete(build: Build) -> Result<(), Error> {
110        Cache::new(build).delete().await
111    }
112}
113
/// The configurable options of a new [`Instance`].
#[derive(Debug)]
pub struct Settings {
    /// The host URI that should be listened to by the [`Instance`].
    pub host: String,
    /// The host port the [`Instance`] should be bound to.
    pub port: u32,
    /// The amount of layers to run in a GPU backend.
    pub gpu_layers: u32,
    /// The standard input stream.
    pub stdin: Stdio,
    /// The standard output stream.
    pub stdout: Stdio,
    /// The standard error stream.
    pub stderr: Stdio,
}
130
131impl Default for Settings {
132    fn default() -> Self {
133        Self {
134            host: "127.0.0.1".to_owned(),
135            port: 8080,
136            gpu_layers: 80,
137            stdin: Stdio::null(),
138            stdout: Stdio::null(),
139            stderr: Stdio::null(),
140        }
141    }
142}
143
/// An active [`Server`] running a specific language model.
///
/// Dropping an [`Instance`] kills the underlying process, since it is
/// spawned with `kill_on_drop`.
#[derive(Debug)]
pub struct Instance {
    /// The host address of the [`Instance`].
    pub host: String,
    /// The host port of the [`Instance`].
    pub port: u32,
    /// The process of the [`Instance`].
    pub process: process::Child,
}
154
155impl Instance {
156    /// The URL of the [`Instance`].
157    pub fn url(&self) -> String {
158        format!("http://{}:{}", self.host, self.port)
159    }
160
161    /// Waits until the [`Instance`] is warmed up and ready to receive requests.
162    pub async fn wait_until_ready(&mut self) -> Result<(), Error> {
163        loop {
164            if let Some(status) = self.process.try_wait()? {
165                return Err(io::Error::other(format!(
166                    "llama-server exited unexpectedly: {status}"
167                )))?;
168            }
169
170            if let Ok(response) = http::client()
171                .get(format!("{}/health", self.url()))
172                .send()
173                .await
174                && response.error_for_status().is_ok()
175            {
176                break;
177            }
178
179            time::sleep(Duration::from_secs(1)).await;
180        }
181
182        Ok(())
183    }
184}
185
/// The download state of a [`Server`].
///
/// Emitted as progress while [`Server::download`] fetches each artifact.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Download {
    /// The [`Artifact`] being downloaded.
    pub artifact: Artifact,
    /// The download [`Progress`].
    pub progress: Progress,
}
194
#[cfg(test)]
mod tests {
    use super::*;

    use tokio::fs;
    use tokio::io;

    // End-to-end smoke test: installs a server build, downloads a small
    // model, boots an instance, and waits for it to become healthy.
    // Ignored by default because it performs large network downloads.
    #[tokio::test]
    #[ignore]
    async fn it_works() -> Result<(), Error> {
        const MODEL_URL: &str = "https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-UD-Q2_K_XL.gguf?download=true";
        const MODEL_FILE: &str = "Qwen3.gguf";

        // On CI we assert on the install cache state, since it starts clean.
        let is_ci = std::env::var("CI").as_deref() == Ok("true");

        if is_ci {
            let installed = Server::list().await?;
            assert!(installed.is_empty());
        }

        // Prefer the latest build; fall back to a known-good pinned build
        // if the lookup fails.
        let build = Build::latest().await.unwrap_or(Build::locked(6730));
        let server = Server::download(build, backend::Set::all()).await?;

        assert_eq!(server.build, build);
        assert_eq!(
            server.backends,
            // On macOS, no extra backend artifacts are expected after
            // normalization.
            if cfg!(target_os = "macos") {
                backend::Set::empty()
            } else {
                backend::Set::all()
            }
        );

        // Fetch the model only once; reruns reuse the local file.
        if !fs::try_exists(MODEL_FILE).await? {
            let model = fs::File::create(MODEL_FILE).await?;
            http::download(MODEL_URL, &mut io::BufWriter::new(model)).await?;
        }

        // Inherit stdout/stderr so server logs show up in the test output.
        let mut instance = server
            .boot(
                MODEL_FILE,
                Settings {
                    stdout: Stdio::inherit(),
                    stderr: Stdio::inherit(),
                    ..Settings::default()
                },
            )
            .await?;
        instance.wait_until_ready().await?;
        assert_eq!(instance.url(), "http://127.0.0.1:8080");

        if is_ci {
            // Dropping the instance kills the server process before we
            // delete its installation.
            drop(instance);

            let installed = Server::list().await?;
            assert!(installed.len() == 1);
            assert_eq!(installed.first(), Some(&server.build));

            Server::delete(server.build).await?;

            let installed = Server::list().await?;
            assert!(installed.is_empty());
        }

        Ok(())
    }
}