ask_llm 2.2.2

Make a request to whatever LLM is the best these days, without hardcoding model/provider.
use eyre::{Result, bail};
use serde::{Deserialize, Serialize};

use crate::{Backend, Request, Response, Role, ThinkingLevel};

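/// Backend for an Ollama server. `url` is the full chat endpoint requests are
/// POSTed to (typically `http://localhost:11434/api/chat`).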
pub(crate) struct Ollama {
	pub model: String,
	pub url: String,
}
impl Ollama {
	async fn do_conversation(&self, request: &Request<'_>) -> Result<Response> {
		if !request.files.is_empty() {
			bail!("Ollama backend does not support file attachments");
		}

		let mut messages: Vec<OllamaMessage> = Vec::new();

		for message in &request.conversation.0 {
			let role = match message.role {
				Role::System => "system",
				Role::User => "user",
				Role::Assistant => "assistant",
			};
			let text = match &message.content {
				crate::MessageContent::Text(t) => t.clone(),
				_ => bail!("Ollama backend only supports text messages"),
			};
			messages.push(OllamaMessage {
				role: role.to_string(),
				content: text,
			});
		}

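		// No structured-output option is set on the request; instead, when the last
		// message is from the user, a JSON-only instruction is appended to it.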
		if request.force_json {
			if let Some(last) = messages.last_mut() {
				if last.role == "user" {
					last.content.push_str("\n\nRespond with valid JSON only, no other text.");
				}
			}
		}

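		// Ollama's `think` flag is boolean: any thinking level other than `None` enables it.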
		let think = !matches!(request.thinking, ThinkingLevel::None);

		let mut ollama_request = OllamaRequest {
			model: self.model.clone(),
			messages,
			stream: false,
			think,
			options: OllamaOptions {
				temperature: request.temperature.unwrap_or(0.0),
				num_predict: request.max_tokens,
				stop: None,
			},
		};

		if let Some(ref seqs) = request.stop_sequences {
			ollama_request.options.stop = Some(seqs.iter().map(|s| s.to_string()).collect());
		}

		let response = reqwest::Client::new().post(&self.url).json(&ollama_request).send().await?;

		let status = response.status();
		if !status.is_success() {
			let body = response.text().await.unwrap_or_default();
			bail!("Ollama request failed ({status}): {body}");
		}

		let value: serde_json::Value = response.json().await?;
		tracing::debug!(?value);

		let parsed: OllamaResponse = serde_json::from_value(value.clone()).inspect_err(|e| {
			eprintln!(
				"Failed to parse Ollama response: {}\n{e:?}",
				serde_json::to_string_pretty(&value).unwrap_or_else(|_| format!("{:?}", value))
			);
		})?;

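		// Ollama reports `load_duration` and `prompt_eval_duration` in nanoseconds;
		// their sum is surfaced as request overhead.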
		let overhead_nanos = parsed.load_duration + parsed.prompt_eval_duration;
		Ok(Response {
			text: parsed.message.content,
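			// Local inference: nothing is billed and wall-clock duration is not measured here.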
			cost_cents: 0.0,
			duration: std::time::Duration::ZERO,
			overhead: std::time::Duration::from_nanos(overhead_nanos),
			model: self.model.clone(),
			thinking: request.thinking,
		})
	}
}

impl Backend for Ollama {
	fn conversation<'a>(&'a self, request: &'a Request<'a>) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<Response>> + Send + 'a>> {
		Box::pin(self.do_conversation(request))
	}
}

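/// Request body for Ollama's chat endpoint, limited to the fields this backend sets.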
#[derive(Debug, Serialize)]
struct OllamaRequest {
	model: String,
	messages: Vec<OllamaMessage>,
	stream: bool,
	think: bool,
	options: OllamaOptions,
}

#[derive(Debug, Serialize)]
struct OllamaOptions {
	temperature: f32,
	#[serde(skip_serializing_if = "Option::is_none")]
	num_predict: Option<usize>,
	#[serde(skip_serializing_if = "Option::is_none")]
	stop: Option<Vec<String>>,
}

#[derive(Debug, Deserialize, Serialize)]
struct OllamaMessage {
	role: String,
	content: String,
}

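/// Subset of Ollama's chat response that this backend reads; the duration fields
/// default to zero when the server omits them.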
#[derive(Debug, Deserialize)]
struct OllamaResponse {
	message: OllamaMessage,
	#[serde(default)]
	load_duration: u64,
	#[serde(default)]
	prompt_eval_duration: u64,
}
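
// A minimal serialization sketch: it exercises the request structs above to show the
// JSON shape posted to Ollama. The model name, prompt, and token limit below are
// arbitrary sample values, not defaults used anywhere in this backend.
#[cfg(test)]
mod tests {
	use super::*;

	#[test]
	fn ollama_request_serializes_to_expected_shape() {
		let request = OllamaRequest {
			model: "llama3".to_string(),
			messages: vec![OllamaMessage {
				role: "user".to_string(),
				content: "Hello".to_string(),
			}],
			stream: false,
			think: false,
			options: OllamaOptions {
				temperature: 0.0,
				num_predict: Some(128),
				stop: None,
			},
		};

		let json = serde_json::to_value(&request).unwrap();
		assert_eq!(json["model"], "llama3");
		assert_eq!(json["messages"][0]["role"], "user");
		assert_eq!(json["options"]["num_predict"], 128);
		// `stop` is `None`, so `skip_serializing_if` keeps it out of the payload entirely.
		assert!(json["options"].get("stop").is_none());
	}
}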