Skip to main content

vtcode_core/llm/providers/
llamacpp.rs

1use super::common::resolve_model;
2use super::ollama::base_url_to_host_root;
3use super::openai::OpenAIProvider;
4use crate::config::TimeoutsConfig;
5use crate::config::constants::{env_vars, models, urls};
6use crate::config::core::{AnthropicConfig, ModelConfig, PromptCachingConfig};
7use crate::llm::client::LLMClient;
8use crate::llm::error_display;
9use crate::llm::provider::{LLMError, LLMProvider, LLMRequest, LLMResponse, LLMStream, Message};
10use crate::llm::providers::common::override_base_url;
11use crate::utils::http_client;
12use anyhow::{Context, Result};
13use async_trait::async_trait;
14use serde::{Deserialize, Serialize};
15use std::collections::HashMap;
16use std::path::Path;
17use std::process::Stdio;
18use std::sync::{Arc, LazyLock, Mutex};
19use std::time::{Duration, Instant};
20use tokio::process::{Child, Command};
21use tokio::sync::{Mutex as AsyncMutex, watch};
22use tokio::time::sleep;
23use url::Url;
24
/// Startup budget, in seconds, used when no valid timeout override is supplied.
const DEFAULT_STARTUP_TIMEOUT_SECONDS: u64 = 60;
/// Pause inserted between two consecutive readiness probes.
const SERVER_POLL_INTERVAL: Duration = Duration::from_millis(500);
27
28#[derive(Debug, Deserialize, Serialize)]
29struct LlamaCppModelsResponse {
30    data: Vec<LlamaCppModel>,
31}
32
33#[derive(Debug, Deserialize, Serialize)]
34struct LlamaCppModel {
35    id: String,
36}
37
/// User-facing hint shown whenever the llama.cpp server cannot be reached.
const LLAMACPP_CONNECTION_ERROR: &str = "llama.cpp is not responding. Install from \
    https://llama.app and either start \
    `llama-server -m /path/to/model.gguf --port 8080` yourself or set \
    LLAMACPP_MODEL_PATH so VT Code can manage startup.";
39
/// Lifecycle stage recorded for a llama.cpp server endpoint.
#[derive(Debug, Clone, PartialEq, Eq)]
enum ServerPhase {
    /// Initial stage: nothing has been launched or confirmed yet.
    NotStarted,
    /// A launch was requested and readiness is still being awaited.
    Starting,
    /// The server answered and reported a model.
    Ready,
    /// The launch, or the managed process itself, failed.
    Failed,
}
47
48#[derive(Debug, Clone)]
49struct ServerStatus {
50    phase: ServerPhase,
51    model_id: Option<String>,
52    model_path: Option<String>,
53    error: Option<String>,
54}
55
56impl Default for ServerStatus {
57    fn default() -> Self {
58        Self {
59            phase: ServerPhase::NotStarted,
60            model_id: None,
61            model_path: None,
62            error: None,
63        }
64    }
65}
66
67impl ServerStatus {
68    fn starting(model_path: Option<String>) -> Self {
69        Self {
70            phase: ServerPhase::Starting,
71            model_id: None,
72            model_path,
73            error: None,
74        }
75    }
76
77    fn ready(model_id: String, model_path: Option<String>) -> Self {
78        Self {
79            phase: ServerPhase::Ready,
80            model_id: Some(model_id),
81            model_path,
82            error: None,
83        }
84    }
85
86    fn failed(error: impl Into<String>, model_path: Option<String>) -> Self {
87        Self {
88            phase: ServerPhase::Failed,
89            model_id: None,
90            model_path,
91            error: Some(error.into()),
92        }
93    }
94}
95
96#[derive(Debug)]
97struct ManagedLlamaCppServer {
98    state: AsyncMutex<ManagedLlamaCppState>,
99    status_tx: watch::Sender<ServerStatus>,
100}
101
102#[derive(Debug, Default)]
103struct ManagedLlamaCppState {
104    child: Option<Child>,
105    status: ServerStatus,
106}
107
108impl ManagedLlamaCppServer {
109    fn new() -> Self {
110        let status = ServerStatus::default();
111        let (status_tx, _) = watch::channel(status.clone());
112        Self {
113            state: AsyncMutex::new(ManagedLlamaCppState {
114                child: None,
115                status,
116            }),
117            status_tx,
118        }
119    }
120}
121
/// Outcome of a single health probe.
#[derive(Debug)]
enum ServerProbe {
    /// The server is up; carries the first model id it reported.
    Ready(String),
    /// The server answered but is still loading its model.
    Loading,
    /// The server is unreachable or unusable; carries the reason.
    Unavailable(String),
}
128
129static MANAGED_LLAMACPP_SERVERS: LazyLock<Mutex<HashMap<String, Arc<ManagedLlamaCppServer>>>> =
130    LazyLock::new(|| Mutex::new(HashMap::new()));
131
132pub async fn fetch_llamacpp_models(base_url: Option<String>) -> Result<Vec<String>, anyhow::Error> {
133    let resolved_base_url = override_base_url(
134        urls::LLAMACPP_API_BASE,
135        base_url,
136        Some(env_vars::LLAMACPP_BASE_URL),
137    );
138    let models_url = format!("{}/models", resolved_base_url.trim_end_matches('/'));
139    let client = http_client::create_client_with_timeout(Duration::from_secs(5));
140    let response = client
141        .get(&models_url)
142        .header("Content-Type", "application/json")
143        .send()
144        .await
145        .map_err(|e| {
146            tracing::warn!("Failed to connect to llama.cpp server: {e:?}");
147            anyhow::anyhow!(LLAMACPP_CONNECTION_ERROR)
148        })?;
149
150    if !response.status().is_success() {
151        return Err(anyhow::anyhow!(
152            "Failed to fetch llama.cpp models: HTTP {}. {}",
153            response.status(),
154            if response.status() == reqwest::StatusCode::NOT_FOUND {
155                "Ensure llama-server is running and exposing the OpenAI-compatible /v1 API."
156            } else {
157                ""
158            }
159        ));
160    }
161
162    let models_response: LlamaCppModelsResponse = response
163        .json()
164        .await
165        .map_err(|e| anyhow::anyhow!("Failed to parse llama.cpp models response: {}", e))?;
166
167    Ok(models_response
168        .data
169        .into_iter()
170        .map(|model| model.id)
171        .collect())
172}
173
174pub struct LlamaCppProvider {
175    inner: OpenAIProvider,
176    api_key: Option<String>,
177    configured_model: Option<String>,
178    base_url: String,
179    prompt_cache: Option<PromptCachingConfig>,
180    timeouts: Option<TimeoutsConfig>,
181    anthropic: Option<AnthropicConfig>,
182    model_behavior: Option<ModelConfig>,
183}
184
185impl LlamaCppProvider {
186    fn resolve_base_url(base_url: Option<String>) -> String {
187        override_base_url(
188            urls::LLAMACPP_API_BASE,
189            base_url,
190            Some(env_vars::LLAMACPP_BASE_URL),
191        )
192    }
193
194    fn build_inner(
195        api_key: Option<String>,
196        model: Option<String>,
197        base_url: Option<String>,
198        prompt_cache: Option<PromptCachingConfig>,
199        timeouts: Option<TimeoutsConfig>,
200        anthropic: Option<AnthropicConfig>,
201        model_behavior: Option<ModelConfig>,
202    ) -> OpenAIProvider {
203        let resolved_model = resolve_model(model, models::llamacpp::DEFAULT_MODEL);
204        let resolved_base = Self::resolve_base_url(base_url);
205        OpenAIProvider::from_config(
206            api_key,
207            None,
208            Some(resolved_model),
209            Some(resolved_base),
210            prompt_cache,
211            timeouts,
212            anthropic,
213            None,
214            model_behavior,
215        )
216    }
217
218    fn managed_server_for(base_url: &str) -> Arc<ManagedLlamaCppServer> {
219        let host_root = base_url_to_host_root(base_url);
220        let mut guard = MANAGED_LLAMACPP_SERVERS
221            .lock()
222            .expect("llama.cpp managed server map poisoned");
223        guard
224            .entry(host_root)
225            .or_insert_with(|| Arc::new(ManagedLlamaCppServer::new()))
226            .clone()
227    }
228
229    fn provider_error(message: impl Into<String>) -> LLMError {
230        LLMError::Provider {
231            message: error_display::format_llm_error("llama.cpp", &message.into()),
232            metadata: None,
233        }
234    }
235
236    fn configured_startup_model_path(configured_model: Option<&str>) -> Option<String> {
237        std::env::var(env_vars::LLAMACPP_MODEL_PATH)
238            .ok()
239            .filter(|value| !value.trim().is_empty())
240            .or_else(|| {
241                configured_model.and_then(|value| {
242                    let trimmed = value.trim();
243                    if trimmed.is_empty() || !Self::looks_like_local_model_path(trimmed) {
244                        return None;
245                    }
246                    Some(trimmed.to_string())
247                })
248            })
249    }
250
251    fn startup_timeout() -> Duration {
252        std::env::var(env_vars::LLAMACPP_STARTUP_TIMEOUT_SECONDS)
253            .ok()
254            .and_then(|value| value.parse::<u64>().ok())
255            .filter(|seconds| *seconds > 0)
256            .map(Duration::from_secs)
257            .unwrap_or_else(|| Duration::from_secs(DEFAULT_STARTUP_TIMEOUT_SECONDS))
258    }
259
260    fn looks_like_local_model_path(value: &str) -> bool {
261        let trimmed = value.trim();
262        if trimmed.is_empty() {
263            return false;
264        }
265
266        trimmed.ends_with(".gguf")
267            || trimmed.contains(std::path::MAIN_SEPARATOR)
268            || trimmed.contains('/')
269            || trimmed.starts_with('.')
270            || Path::new(trimmed).exists()
271    }
272
273    fn is_local_base_url(base_url: &str) -> bool {
274        let host_root = base_url_to_host_root(base_url);
275        Url::parse(&host_root)
276            .ok()
277            .and_then(|url| url.host_str().map(str::to_ascii_lowercase))
278            .is_some_and(|host| host == "localhost" || host == "127.0.0.1" || host == "::1")
279    }
280
281    fn host_port(base_url: &str) -> Result<u16> {
282        let host_root = base_url_to_host_root(base_url);
283        let parsed = Url::parse(&host_root)
284            .with_context(|| format!("Failed to parse llama.cpp base URL: {host_root}"))?;
285        Ok(parsed.port().unwrap_or(8080))
286    }
287
288    fn resolve_binary_path() -> Result<String> {
289        if let Ok(path) = std::env::var(env_vars::LLAMACPP_BINARY_PATH)
290            && !path.trim().is_empty()
291        {
292            return Ok(path);
293        }
294
295        which::which("llama-server")
296            .map(|path| path.to_string_lossy().into_owned())
297            .context("Could not find `llama-server` on PATH. Install llama.cpp from https://llama.app or set LLAMACPP_BINARY_PATH.")
298    }
299
300    fn build_command_args(base_url: &str, model_path: &str) -> Result<Vec<String>> {
301        let path = Path::new(model_path);
302        if !path.exists() {
303            anyhow::bail!("Configured model path does not exist: {model_path}");
304        }
305
306        let mut args = vec![
307            "-m".to_string(),
308            model_path.to_string(),
309            "--port".to_string(),
310            Self::host_port(base_url)?.to_string(),
311        ];
312
313        if let Ok(extra_args) = std::env::var(env_vars::LLAMACPP_EXTRA_ARGS)
314            && !extra_args.trim().is_empty()
315        {
316            args.extend(shell_words::split(&extra_args).with_context(|| {
317                format!(
318                    "Failed to parse {}: {extra_args}",
319                    env_vars::LLAMACPP_EXTRA_ARGS
320                )
321            })?);
322        }
323
324        Ok(args)
325    }
326
327    async fn spawn_managed_server(base_url: &str, model_path: &str) -> Result<Child> {
328        let binary = Self::resolve_binary_path()?;
329        let args = Self::build_command_args(base_url, model_path)?;
330        let mut command = Command::new(&binary);
331        command
332            .args(&args)
333            .stdin(Stdio::null())
334            .stdout(Stdio::null())
335            .stderr(Stdio::null())
336            .kill_on_drop(true);
337
338        command.spawn().with_context(|| {
339            format!(
340                "Failed to start llama.cpp server with `{binary} {}`",
341                args.join(" ")
342            )
343        })
344    }
345
346    async fn probe_server(base_url: &str) -> ServerProbe {
347        let host_root = base_url_to_host_root(base_url);
348        let health_url = format!("{}/health", host_root.trim_end_matches('/'));
349        let client = http_client::create_client_with_timeout(Duration::from_secs(5));
350
351        let response = match client.get(&health_url).send().await {
352            Ok(response) => response,
353            Err(error) => {
354                tracing::debug!("llama.cpp health probe failed for {health_url}: {error}");
355                return ServerProbe::Unavailable(LLAMACPP_CONNECTION_ERROR.to_string());
356            }
357        };
358
359        if response.status().is_success() {
360            return match fetch_llamacpp_models(Some(base_url.to_string())).await {
361                Ok(models) if !models.is_empty() => ServerProbe::Ready(models[0].clone()),
362                Ok(_) => ServerProbe::Unavailable(
363                    "llama.cpp is running but did not report any loaded models from /v1/models."
364                        .to_string(),
365                ),
366                Err(error) => ServerProbe::Unavailable(error.to_string()),
367            };
368        }
369
370        let status = response.status();
371        let body = response.text().await.unwrap_or_default();
372        if status == reqwest::StatusCode::SERVICE_UNAVAILABLE
373            && body.to_ascii_lowercase().contains("loading")
374        {
375            return ServerProbe::Loading;
376        }
377
378        if status == reqwest::StatusCode::NOT_FOUND {
379            return match fetch_llamacpp_models(Some(base_url.to_string())).await {
380                Ok(models) if !models.is_empty() => ServerProbe::Ready(models[0].clone()),
381                Ok(_) => ServerProbe::Unavailable(
382                    "llama.cpp is running but did not report any loaded models from /v1/models."
383                        .to_string(),
384                ),
385                Err(error) => ServerProbe::Unavailable(error.to_string()),
386            };
387        }
388
389        ServerProbe::Unavailable(format!(
390            "llama.cpp health check failed with HTTP {}{}",
391            status,
392            if body.trim().is_empty() {
393                String::new()
394            } else {
395                format!(": {}", body.trim())
396            }
397        ))
398    }
399
400    async fn wait_until_ready(base_url: &str, timeout: Duration) -> Result<String> {
401        let deadline = Instant::now() + timeout;
402        let mut last_error = LLAMACPP_CONNECTION_ERROR.to_string();
403
404        while Instant::now() < deadline {
405            match Self::probe_server(base_url).await {
406                ServerProbe::Ready(model_id) => return Ok(model_id),
407                ServerProbe::Loading => {
408                    last_error = "llama.cpp is still loading the configured model".to_string();
409                }
410                ServerProbe::Unavailable(message) => {
411                    last_error = message;
412                }
413            }
414
415            sleep(SERVER_POLL_INTERVAL).await;
416        }
417
418        Err(anyhow::anyhow!(
419            "Timed out waiting for llama.cpp to become ready after {}s. Last status: {}",
420            timeout.as_secs(),
421            last_error
422        ))
423    }
424
425    async fn ensure_server_ready(&self) -> Result<String, LLMError> {
426        let timeout = Self::startup_timeout();
427        let initial_probe = Self::probe_server(&self.base_url).await;
428        match &initial_probe {
429            ServerProbe::Ready(model_id) => {
430                let server = Self::managed_server_for(&self.base_url);
431                let mut state = server.state.lock().await;
432                state.status =
433                    ServerStatus::ready(model_id.clone(), state.status.model_path.clone());
434                let _ = server.status_tx.send(state.status.clone());
435                return Ok(model_id.clone());
436            }
437            ServerProbe::Loading => {}
438            ServerProbe::Unavailable(_) => {}
439        }
440
441        let startup_model_path =
442            Self::configured_startup_model_path(self.configured_model.as_deref());
443        let server = Self::managed_server_for(&self.base_url);
444        let mut rx = server.status_tx.subscribe();
445
446        loop {
447            let mut should_spawn = false;
448            {
449                let mut state = server.state.lock().await;
450
451                if let Some(child) = state.child.as_mut() {
452                    match child.try_wait() {
453                        Ok(Some(exit_status)) => {
454                            let model_path = state.status.model_path.clone();
455                            state.child = None;
456                            state.status = ServerStatus::failed(
457                                format!(
458                                    "Managed llama.cpp server exited with status {exit_status}"
459                                ),
460                                model_path,
461                            );
462                            let _ = server.status_tx.send(state.status.clone());
463                        }
464                        Ok(None) => {}
465                        Err(error) => {
466                            let model_path = state.status.model_path.clone();
467                            state.child = None;
468                            state.status = ServerStatus::failed(
469                                format!("Failed to inspect managed llama.cpp server: {error}"),
470                                model_path,
471                            );
472                            let _ = server.status_tx.send(state.status.clone());
473                        }
474                    }
475                }
476
477                match state.status.phase {
478                    ServerPhase::Ready => {
479                        if let Some(model_id) = state.status.model_id.clone() {
480                            return Ok(model_id);
481                        }
482                        state.status = ServerStatus::default();
483                        let _ = server.status_tx.send(state.status.clone());
484                    }
485                    ServerPhase::Starting => {}
486                    ServerPhase::NotStarted | ServerPhase::Failed => {
487                        match startup_model_path.clone() {
488                            Some(model_path) => {
489                                if !Self::is_local_base_url(&self.base_url) {
490                                    return Err(Self::provider_error(format!(
491                                        "{} Auto-start is only available for localhost llama.cpp endpoints.",
492                                        LLAMACPP_CONNECTION_ERROR
493                                    )));
494                                }
495                                state.status = ServerStatus::starting(Some(model_path));
496                                let _ = server.status_tx.send(state.status.clone());
497                                should_spawn = true;
498                            }
499                            None => {
500                                let reason = match &initial_probe {
501                                    ServerProbe::Unavailable(message) => message.clone(),
502                                    ServerProbe::Loading => {
503                                        "llama.cpp is still loading but no managed model path is configured"
504                                            .to_string()
505                                    }
506                                    ServerProbe::Ready(model_id) => return Ok(model_id.clone()),
507                                };
508                                return Err(Self::provider_error(format!(
509                                    "{reason} Set {} or configure the provider model to a local .gguf path so VT Code can launch llama-server automatically.",
510                                    env_vars::LLAMACPP_MODEL_PATH
511                                )));
512                            }
513                        }
514                    }
515                }
516            }
517
518            if should_spawn {
519                let model_path = startup_model_path.clone().ok_or_else(|| {
520                    Self::provider_error(format!(
521                        "Managed llama.cpp startup requires {} or a provider model path",
522                        env_vars::LLAMACPP_MODEL_PATH
523                    ))
524                })?;
525
526                let spawn_result = async {
527                    let child = Self::spawn_managed_server(&self.base_url, &model_path).await?;
528                    let model_id = Self::wait_until_ready(&self.base_url, timeout).await?;
529                    Ok::<_, anyhow::Error>((child, model_id))
530                }
531                .await;
532
533                let mut state = server.state.lock().await;
534                match spawn_result {
535                    Ok((child, model_id)) => {
536                        state.child = Some(child);
537                        state.status = ServerStatus::ready(model_id.clone(), Some(model_path));
538                        let _ = server.status_tx.send(state.status.clone());
539                        return Ok(model_id);
540                    }
541                    Err(error) => {
542                        state.child = None;
543                        state.status = ServerStatus::failed(error.to_string(), Some(model_path));
544                        let _ = server.status_tx.send(state.status.clone());
545                        return Err(Self::provider_error(error.to_string()));
546                    }
547                }
548            }
549
550            rx.changed().await.map_err(|_| {
551                Self::provider_error("llama.cpp managed server watcher unexpectedly closed")
552            })?;
553
554            let status = rx.borrow().clone();
555            match status.phase {
556                ServerPhase::Ready => {
557                    if let Some(model_id) = status.model_id {
558                        return Ok(model_id);
559                    }
560                }
561                ServerPhase::Failed => {
562                    return Err(Self::provider_error(
563                        status
564                            .error
565                            .unwrap_or_else(|| LLAMACPP_CONNECTION_ERROR.to_string()),
566                    ));
567                }
568                ServerPhase::Starting | ServerPhase::NotStarted => {}
569            }
570        }
571    }
572
573    fn should_replace_request_model(
574        &self,
575        request_model: &str,
576        discovered_models: &[String],
577    ) -> bool {
578        let trimmed = request_model.trim();
579        if trimmed.is_empty() || Self::looks_like_local_model_path(trimmed) {
580            return true;
581        }
582
583        if discovered_models.len() == 1 {
584            let configured = self
585                .configured_model
586                .as_deref()
587                .map(str::trim)
588                .unwrap_or_default();
589            if trimmed == models::llamacpp::DEFAULT_MODEL || trimmed == configured {
590                return true;
591            }
592        }
593
594        !discovered_models.iter().any(|model| model == trimmed) && discovered_models.len() == 1
595    }
596
597    fn request_model_or_default(&self, request_model: &str) -> String {
598        let trimmed = request_model.trim();
599        if trimmed.is_empty() {
600            resolve_model(
601                self.configured_model.clone(),
602                models::llamacpp::DEFAULT_MODEL,
603            )
604        } else {
605            trimmed.to_string()
606        }
607    }
608
609    fn build_request_provider(&self, model: String) -> OpenAIProvider {
610        Self::build_inner(
611            self.api_key.clone(),
612            Some(model),
613            Some(self.base_url.clone()),
614            self.prompt_cache.clone(),
615            self.timeouts.clone(),
616            self.anthropic.clone(),
617            self.model_behavior.clone(),
618        )
619    }
620
621    async fn prepare_request(
622        &self,
623        mut request: LLMRequest,
624    ) -> Result<(OpenAIProvider, LLMRequest), LLMError> {
625        let discovered_model = self.ensure_server_ready().await?;
626        let discovered_models = vec![discovered_model.clone()];
627
628        if self.should_replace_request_model(&request.model, &discovered_models)
629            || request.model.trim().is_empty()
630        {
631            request.model = discovered_model.clone();
632        } else {
633            request.model = self.request_model_or_default(&request.model);
634        }
635
636        Ok((self.build_request_provider(request.model.clone()), request))
637    }
638
639    pub fn from_config(
640        api_key: Option<String>,
641        model: Option<String>,
642        base_url: Option<String>,
643        prompt_cache: Option<PromptCachingConfig>,
644        timeouts: Option<TimeoutsConfig>,
645        anthropic: Option<AnthropicConfig>,
646        model_behavior: Option<ModelConfig>,
647    ) -> Self {
648        let resolved_base_url = Self::resolve_base_url(base_url.clone());
649        Self {
650            inner: Self::build_inner(
651                api_key.clone(),
652                model.clone(),
653                base_url,
654                prompt_cache.clone(),
655                timeouts.clone(),
656                anthropic.clone(),
657                model_behavior.clone(),
658            ),
659            api_key,
660            configured_model: model,
661            base_url: resolved_base_url,
662            prompt_cache,
663            timeouts,
664            anthropic,
665            model_behavior,
666        }
667    }
668}
669
670#[async_trait]
671impl LLMProvider for LlamaCppProvider {
672    fn name(&self) -> &str {
673        "llamacpp"
674    }
675
676    fn supports_streaming(&self) -> bool {
677        self.inner.supports_streaming()
678    }
679
680    fn supports_reasoning(&self, model: &str) -> bool {
681        self.inner.supports_reasoning(model)
682    }
683
684    fn supports_reasoning_effort(&self, model: &str) -> bool {
685        self.inner.supports_reasoning_effort(model)
686    }
687
688    fn supports_tools(&self, model: &str) -> bool {
689        self.inner.supports_tools(model)
690    }
691
692    fn supports_parallel_tool_config(&self, model: &str) -> bool {
693        self.inner.supports_parallel_tool_config(model)
694    }
695
696    async fn generate(&self, request: LLMRequest) -> Result<LLMResponse, LLMError> {
697        let (provider, request) = self.prepare_request(request).await?;
698        provider.generate(request).await
699    }
700
701    async fn stream(&self, request: LLMRequest) -> Result<LLMStream, LLMError> {
702        let (provider, request) = self.prepare_request(request).await?;
703        provider.stream(request).await
704    }
705
706    fn supported_models(&self) -> Vec<String> {
707        models::llamacpp::SUPPORTED_MODELS
708            .iter()
709            .map(|model| model.to_string())
710            .collect()
711    }
712
713    fn validate_request(&self, request: &LLMRequest) -> Result<(), LLMError> {
714        if request.messages.is_empty() {
715            let formatted_error =
716                error_display::format_llm_error("llama.cpp", "Messages cannot be empty");
717            return Err(LLMError::InvalidRequest {
718                message: formatted_error,
719                metadata: None,
720            });
721        }
722
723        for message in &request.messages {
724            if let Err(err) = message.validate_for_provider("openai") {
725                let formatted = error_display::format_llm_error("llama.cpp", &err);
726                return Err(LLMError::InvalidRequest {
727                    message: formatted,
728                    metadata: None,
729                });
730            }
731        }
732
733        Ok(())
734    }
735}
736
737#[async_trait]
738impl LLMClient for LlamaCppProvider {
739    async fn generate(&mut self, prompt: &str) -> Result<LLMResponse, LLMError> {
740        LLMProvider::generate(
741            self,
742            LLMRequest {
743                messages: vec![Message::user(prompt.to_string())],
744                model: self
745                    .configured_model
746                    .clone()
747                    .unwrap_or_else(|| models::llamacpp::DEFAULT_MODEL.to_string()),
748                ..Default::default()
749            },
750        )
751        .await
752    }
753
754    fn model_id(&self) -> &str {
755        self.inner.model_id()
756    }
757}