1use axum::{extract::State, response::Json};
8use serde::{Deserialize, Serialize};
9
10use super::state::BancoState;
11use crate::serve::templates::ChatMessage;
12
/// Request body for the Ollama-compatible `POST /api/chat` endpoint.
#[derive(Debug, Clone, Deserialize)]
pub struct OllamaChatRequest {
    /// Optional model name; the chat handler falls back to "banco-echo" when absent.
    #[serde(default)]
    pub model: Option<String>,
    /// Conversation messages to generate an assistant reply for.
    pub messages: Vec<OllamaMessage>,
    /// Streaming flag accepted for wire compatibility; the chat handler
    /// currently ignores it and always replies with a single JSON body.
    #[serde(default)]
    pub stream: bool,
}
26
/// A single chat turn in Ollama wire format (used in both requests and responses).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OllamaMessage {
    /// Sender role; the chat handler recognizes "system" and "assistant",
    /// treating any other value as a user turn.
    pub role: String,
    /// Plain-text message body.
    pub content: String,
}
33
/// Response body for the Ollama-compatible chat endpoint.
#[derive(Debug, Clone, Serialize)]
pub struct OllamaChatResponse {
    /// Model name echoed back to the client.
    pub model: String,
    /// RFC 3339 timestamp of when the response was produced.
    pub created_at: String,
    /// The generated assistant message.
    pub message: OllamaMessage,
    /// Always `true` in this server: responses are never streamed in chunks.
    pub done: bool,
    /// Generation duration; stubbed to 0 by the handlers in this module
    /// (Ollama reports nanoseconds here — TODO confirm against spec).
    pub total_duration: u64,
    /// Estimated token count of the prompt.
    pub prompt_eval_count: u32,
    /// Number of tokens produced during generation.
    pub eval_count: u32,
}
45
/// Response body for `GET /api/tags` — the list of available models.
#[derive(Debug, Clone, Serialize)]
pub struct OllamaTagsResponse {
    /// One entry per exposed model (here: one per recommended backend).
    pub models: Vec<OllamaModelInfo>,
}
51
/// Metadata for one entry in the tags listing.
#[derive(Debug, Clone, Serialize)]
pub struct OllamaModelInfo {
    /// Display name; the tags handler sets this to the lowercased backend name.
    pub name: String,
    /// Model identifier; set to the same value as `name` by the tags handler.
    pub model: String,
    /// Model size in bytes; stubbed to 0 by this server.
    pub size: u64,
    /// Content digest; stubbed to an empty string by this server.
    pub digest: String,
}
60
/// Request body for `POST /api/show`.
#[derive(Debug, Clone, Deserialize)]
pub struct OllamaShowRequest {
    /// Name of the model to describe; echoed into the stub Modelfile.
    pub name: String,
}
66
/// Response body for `POST /api/show` (all fields are static stubs here).
#[derive(Debug, Clone, Serialize)]
pub struct OllamaShowResponse {
    /// Modelfile text, e.g. `FROM <name>`.
    pub modelfile: String,
    /// Newline-separated parameter listing.
    pub parameters: String,
    /// Prompt template in Ollama's Go-template syntax.
    pub template: String,
}
74
75pub async fn ollama_chat_handler(
81 State(state): State<BancoState>,
82 Json(request): Json<OllamaChatRequest>,
83) -> Json<OllamaChatResponse> {
84 let model = request.model.unwrap_or_else(|| "banco-echo".to_string());
85
86 let messages: Vec<ChatMessage> = request
88 .messages
89 .iter()
90 .map(|m| match m.role.as_str() {
91 "system" => ChatMessage::system(&m.content),
92 "assistant" => ChatMessage::assistant(&m.content),
93 _ => ChatMessage::user(&m.content),
94 })
95 .collect();
96
97 let prompt_tokens = state.context_manager.estimate_tokens(&messages) as u32;
98 let (content, eval_count) = generate_ollama_response(&state, &messages);
99
100 Json(OllamaChatResponse {
101 model,
102 created_at: chrono::Utc::now().to_rfc3339(),
103 message: OllamaMessage { role: "assistant".to_string(), content },
104 done: true,
105 total_duration: 0,
106 prompt_eval_count: prompt_tokens,
107 eval_count,
108 })
109}
110
/// Request body for the Ollama-compatible `POST /api/generate` endpoint.
#[derive(Debug, Clone, Deserialize)]
pub struct OllamaGenerateRequest {
    /// Optional model name; the generate handler falls back to "banco-echo".
    #[serde(default)]
    pub model: Option<String>,
    /// User prompt text.
    pub prompt: String,
    /// Optional system prompt, prepended as a system message when present.
    #[serde(default)]
    pub system: Option<String>,
    /// Streaming flag accepted for wire compatibility; the generate handler
    /// currently ignores it and always replies with a single JSON body.
    #[serde(default)]
    pub stream: bool,
}
122
/// Response body for the Ollama-compatible generate endpoint.
#[derive(Debug, Clone, Serialize)]
pub struct OllamaGenerateResponse {
    /// Model name echoed back to the client.
    pub model: String,
    /// RFC 3339 timestamp of when the response was produced.
    pub created_at: String,
    /// The generated completion text.
    pub response: String,
    /// Always `true` in this server: responses are never streamed in chunks.
    pub done: bool,
    /// Generation duration; stubbed to 0 by the handlers in this module.
    pub total_duration: u64,
    /// Estimated token count of the prompt.
    pub prompt_eval_count: u32,
    /// Number of tokens produced during generation.
    pub eval_count: u32,
}
134
135pub async fn ollama_generate_handler(
137 State(state): State<BancoState>,
138 Json(request): Json<OllamaGenerateRequest>,
139) -> Json<OllamaGenerateResponse> {
140 let model = request.model.unwrap_or_else(|| "banco-echo".to_string());
141
142 let mut messages = Vec::new();
144 if let Some(system) = &request.system {
145 messages.push(ChatMessage::system(system));
146 }
147 messages.push(ChatMessage::user(&request.prompt));
148
149 let prompt_tokens = state.context_manager.estimate_tokens(&messages) as u32;
150 let (content, eval_count) = generate_ollama_response(&state, &messages);
151
152 Json(OllamaGenerateResponse {
153 model,
154 created_at: chrono::Utc::now().to_rfc3339(),
155 response: content,
156 done: true,
157 total_duration: 0,
158 prompt_eval_count: prompt_tokens,
159 eval_count,
160 })
161}
162
163fn generate_ollama_response(state: &BancoState, messages: &[ChatMessage]) -> (String, u32) {
165 #[cfg(feature = "realizar")]
167 if let Some(model) = state.model.quantized_model() {
168 let vocab = state.model.vocabulary();
169 if !vocab.is_empty() {
170 let formatted = state.template_engine.apply(messages);
171 let prompt_tokens = state.model.encode_text(&formatted);
172 if !prompt_tokens.is_empty() {
173 let server_params = state.inference_params.read().ok();
174 let params = super::inference::SamplingParams {
175 temperature: server_params.as_ref().map(|p| p.temperature).unwrap_or(0.7),
176 top_k: server_params.as_ref().map(|p| p.top_k).unwrap_or(40),
177 max_tokens: server_params.as_ref().map(|p| p.max_tokens).unwrap_or(256),
178 };
179 drop(server_params);
180 if let Ok(result) =
181 super::inference::generate_sync(&model, &vocab, &prompt_tokens, ¶ms)
182 {
183 return (result.text, result.token_count);
184 }
185 }
186 }
187 }
188
189 let _ = (state, messages);
191 let content = "No model loaded. Load a GGUF model to enable inference:\n\
192 curl -X POST http://localhost:8090/api/v1/models/load -d '{\"model\": \"./model.gguf\"}'"
193 .to_string();
194 let eval_count = (content.len() / 4) as u32;
195 (content, eval_count)
196}
197
198pub async fn ollama_tags_handler(State(state): State<BancoState>) -> Json<OllamaTagsResponse> {
200 let backends = state.backend_selector.recommend();
201 let models = backends
202 .iter()
203 .map(|b| {
204 let name = format!("{b:?}").to_lowercase();
205 OllamaModelInfo { name: name.clone(), model: name, size: 0, digest: String::new() }
206 })
207 .collect();
208 Json(OllamaTagsResponse { models })
209}
210
211pub async fn ollama_show_handler(
213 Json(request): Json<OllamaShowRequest>,
214) -> Json<OllamaShowResponse> {
215 Json(OllamaShowResponse {
216 modelfile: format!("FROM {}", request.name),
217 parameters: "temperature 0.7\ntop_p 1.0".to_string(),
218 template: "{{ .System }}\n{{ .Prompt }}".to_string(),
219 })
220}
221
222pub async fn ollama_pull_handler(
224 State(state): State<BancoState>,
225 Json(request): Json<OllamaPullRequest>,
226) -> Json<OllamaPullResponse> {
227 state.events.emit(&super::events::BancoEvent::SystemEvent {
229 message: format!("Ollama pull: {}", request.name),
230 });
231
232 Json(OllamaPullResponse {
233 status: "success".to_string(),
234 digest: format!("sha256:{:x}", fxhash(&request.name)),
235 total: 0,
236 completed: 0,
237 })
238}
239
240pub async fn ollama_delete_handler(
242 State(state): State<BancoState>,
243 Json(request): Json<OllamaDeleteRequest>,
244) -> axum::http::StatusCode {
245 let _ = request.name;
246 let _ = state.model.unload();
247 state.events.emit(&super::events::BancoEvent::ModelUnloaded);
248 axum::http::StatusCode::OK
249}
250
/// Request body for `POST /api/pull`.
#[derive(Debug, Deserialize)]
pub struct OllamaPullRequest {
    /// Model name to pull; also used to derive the synthetic response digest.
    pub name: String,
    /// Accepted for wire compatibility; unused by the pull handler.
    #[serde(default)]
    pub insecure: bool,
    /// Accepted for wire compatibility; unused by the pull handler.
    #[serde(default)]
    pub stream: bool,
}
260
/// Response body for `POST /api/pull`.
#[derive(Debug, Serialize)]
pub struct OllamaPullResponse {
    /// Pull outcome; this server always reports "success".
    pub status: String,
    /// Synthetic "sha256:…" digest derived from the model name via a 64-bit
    /// hash — not a real content digest.
    pub digest: String,
    /// Total progress counter; stubbed to 0 since nothing is downloaded.
    pub total: u64,
    /// Completed progress counter; stubbed to 0 since nothing is downloaded.
    pub completed: u64,
}
269
/// Request body for the Ollama delete endpoint.
#[derive(Debug, Deserialize)]
pub struct OllamaDeleteRequest {
    /// Model name to delete; currently ignored by the delete handler, which
    /// unloads whatever model is loaded.
    pub name: String,
}
275
/// 64-bit FNV-1a hash of `s`.
///
/// NOTE(review): despite the name, the constants (offset basis
/// 0xcbf29ce484222325, prime 0x100000001b3) are FNV-1a, not rustc's FxHash.
/// Used only to synthesize a stable fake digest for pull responses.
fn fxhash(s: &str) -> u64 {
    s.bytes().fold(0xcbf29ce484222325u64, |hash, byte| {
        (hash ^ u64::from(byte)).wrapping_mul(0x100000001b3)
    })
}