1use axum::{
4 extract::State,
5 http::StatusCode,
6 response::{
7 sse::{Event, Sse},
8 IntoResponse, Json,
9 },
10};
11use std::convert::Infallible;
12use std::time::{SystemTime, UNIX_EPOCH};
13use tokio_stream::Stream;
14
15use super::state::BancoState;
16use super::types::{
17 BancoChatChunk, BancoChatRequest, BancoChatResponse, ChatChoice, ChatChunkChoice, ChatDelta,
18 ErrorResponse, Usage,
19};
20
21pub use super::handlers_conversations::{
23 create_conversation_handler, delete_conversation_handler, export_conversations_handler,
24 get_conversation_handler, import_conversations_handler, list_conversations_handler,
25 rename_conversation_handler, search_conversations_handler,
26};
27pub use super::handlers_prompts::{
28 delete_prompt_handler, get_prompt_handler, list_prompts_handler, save_prompt_handler,
29};
30use crate::serve::router::RoutingDecision;
31use crate::serve::templates::{ChatMessage, Role};
32
33pub async fn health_handler(State(state): State<BancoState>) -> Json<super::types::HealthResponse> {
38 Json(state.health_status())
39}
40
41pub async fn readiness_handler(State(state): State<BancoState>) -> StatusCode {
44 if state.model.is_loaded() {
45 StatusCode::OK
46 } else {
47 StatusCode::SERVICE_UNAVAILABLE
48 }
49}
50
51pub async fn liveness_handler() -> StatusCode {
54 StatusCode::OK
55}
56
57pub async fn models_handler(State(state): State<BancoState>) -> Json<super::types::ModelsResponse> {
62 Json(state.list_models())
63}
64
65pub async fn system_handler(State(state): State<BancoState>) -> Json<super::types::SystemResponse> {
70 Json(state.system_info())
71}
72
73pub async fn chat_completions_handler(
78 State(state): State<BancoState>,
79 Json(request): Json<BancoChatRequest>,
80) -> Result<impl IntoResponse, (StatusCode, Json<ErrorResponse>)> {
81 let mut request = request;
83 for msg in &mut request.messages {
84 msg.content = state.prompts.expand(&msg.content);
85 }
86
87 if !request.attachments.is_empty() {
89 let mut attachment_context = String::new();
90 for att in &request.attachments {
91 attachment_context
92 .push_str(&format!("[Attached file: {}]\n{}\n\n", att.name, att.content));
93 }
94 let att_msg = ChatMessage::system(format!(
95 "The user has attached the following files:\n\n{attachment_context}"
96 ));
97 request.messages.insert(0, att_msg);
98 }
99
100 if request.rag {
102 let query = request.messages.last().map(|m| m.content.as_str()).unwrap_or("");
103 let top_k = request.rag_config.as_ref().map(|c| c.top_k).unwrap_or(5);
104 let min_score = request.rag_config.as_ref().map(|c| c.min_score).unwrap_or(0.1);
105 let results = state.rag.search(query, top_k, min_score);
106 if !results.is_empty() {
107 let context: String = results
108 .iter()
109 .map(|r| format!("[Source: {} chunk {}]\n{}", r.file, r.chunk, r.text))
110 .collect::<Vec<_>>()
111 .join("\n\n");
112 let rag_msg = ChatMessage::system(format!(
113 "Use the following context to answer the user's question:\n\n{context}"
114 ));
115 request.messages.insert(0, rag_msg);
116 }
117 }
118
119 if request.messages.is_empty() {
121 return Err((
122 StatusCode::BAD_REQUEST,
123 Json(ErrorResponse::new("messages must not be empty", "invalid_request", 400)),
124 ));
125 }
126
127 if !state.context_manager.fits(&request.messages) {
129 return Err((
130 StatusCode::BAD_REQUEST,
131 Json(ErrorResponse::new(
132 "messages exceed context window",
133 "context_length_exceeded",
134 400,
135 )),
136 ));
137 }
138
139 if let Err(e) = state.circuit_breaker.check(0.001) {
141 return Err((
142 StatusCode::TOO_MANY_REQUESTS,
143 Json(ErrorResponse::new(e.to_string(), "rate_limit", 429)),
144 ));
145 }
146
147 if let Some(ref conv_id) = request.conversation_id {
149 for msg in &request.messages {
150 let _ = state.conversations.append(conv_id, msg.clone());
151 }
152 }
153
154 let decision = state.router.route();
156
157 if request.stream {
158 Ok(stream_response(state, request, decision).into_response())
159 } else {
160 Ok(sync_response(state, request, decision).into_response())
161 }
162}
163
164fn sync_response(
169 state: BancoState,
170 request: BancoChatRequest,
171 decision: RoutingDecision,
172) -> Json<BancoChatResponse> {
173 let model_name = request
174 .model
175 .clone()
176 .or_else(|| state.model.info().map(|m| m.model_id))
177 .unwrap_or_else(|| "banco-echo".to_string());
178
179 let _formatted = state.template_engine.apply(&request.messages);
181
182 #[cfg(feature = "realizar")]
184 let inference_result = super::handlers_inference::try_inference(&state, &request);
185 #[cfg(not(feature = "realizar"))]
186 let inference_result: Option<(String, String, u32)> = None;
187
188 let (content, finish_reason, actual_completion_tokens) = if let Some(result) = inference_result
189 {
190 result
191 } else if state.model.is_loaded() {
192 let model_info = state
193 .model
194 .info()
195 .map(|m| {
196 format!(
197 "{}({} layers, {}d)",
198 m.architecture.as_deref().unwrap_or("?"),
199 m.num_layers.unwrap_or(0),
200 m.hidden_dim.unwrap_or(0)
201 )
202 })
203 .unwrap_or_default();
204 (
205 format!(
206 "[banco model-loaded] {model_info} | route={decision:?} | prompt_len={} | inference=pending",
207 request.messages.len()
208 ),
209 "model_loaded".to_string(),
210 0u32,
211 )
212 } else {
213 let last_msg = request.messages.last().map(|m| m.content.as_str()).unwrap_or("");
214 let echo = if last_msg.len() > 200 { &last_msg[..200] } else { last_msg };
215 (
216 format!(
217 "No model loaded. To enable inference, load a GGUF model:\n\
218 curl -X POST http://localhost:8090/api/v1/models/load -d '{{\"model\": \"./model.gguf\"}}'\n\n\
219 Your message ({} chars, {} tokens estimated): {echo}",
220 last_msg.len(),
221 request.messages.len(),
222 ),
223 "dry_run".to_string(),
224 0u32,
225 )
226 };
227
228 let prompt_tokens = state.context_manager.estimate_tokens(&request.messages) as u32;
229 let completion_tokens = if actual_completion_tokens > 0 {
230 actual_completion_tokens
231 } else {
232 (content.len() / 4) as u32
233 };
234
235 if let Some(ref conv_id) = request.conversation_id {
237 let _ = state.conversations.append(conv_id, ChatMessage::assistant(&content));
238 }
239
240 Json(BancoChatResponse {
241 id: format!("banco-{}", now_epoch()),
242 object: "chat.completion".to_string(),
243 created: now_epoch(),
244 model: model_name,
245 choices: vec![ChatChoice {
246 index: 0,
247 message: ChatMessage::assistant(content),
248 finish_reason,
249 }],
250 usage: {
251 let total = prompt_tokens + completion_tokens;
252 let window = state.context_manager.available_tokens() as u32;
253 let pct = if window > 0 { (total as f32 / window as f32) * 100.0 } else { 0.0 };
254 Usage {
255 prompt_tokens,
256 completion_tokens,
257 total_tokens: total,
258 context_window: Some(window),
259 context_used_pct: Some(pct),
260 }
261 },
262 })
263}
264
265fn stream_response(
270 state: BancoState,
271 request: BancoChatRequest,
272 _decision: RoutingDecision,
273) -> Sse<impl Stream<Item = Result<Event, Infallible>>> {
274 let model_name = request.model.clone().unwrap_or_else(|| "banco-echo".to_string());
275 let id = format!("banco-{}", now_epoch());
276 let created = now_epoch();
277
278 #[cfg(feature = "realizar")]
280 let real_tokens = super::handlers_inference::try_stream_inference(&state, &request);
281 #[cfg(not(feature = "realizar"))]
282 let real_tokens: Option<Vec<(String, Option<String>)>> = {
283 let _ = &state;
284 None
285 };
286
287 let tokens_with_finish: Vec<(String, Option<String>)> = if let Some(toks) = real_tokens {
288 toks
289 } else {
290 vec![
292 ("No model loaded. ".to_string(), None),
293 ("Load a GGUF model via ".to_string(), None),
294 ("POST /api/v1/models/load ".to_string(), None),
295 ("to enable real inference.".to_string(), None),
296 (String::new(), Some("dry_run".to_string())),
297 ]
298 };
299
300 let stream = async_stream::stream! {
301 let role_chunk = BancoChatChunk {
303 id: id.clone(),
304 object: "chat.completion.chunk".to_string(),
305 created,
306 model: model_name.clone(),
307 choices: vec![ChatChunkChoice {
308 index: 0,
309 delta: ChatDelta { role: Some(Role::Assistant), content: None },
310 finish_reason: None,
311 }],
312 };
313 if let Ok(data) = serde_json::to_string(&role_chunk) {
314 yield Ok(Event::default().data(data));
315 }
316
317 for (text, finish) in &tokens_with_finish {
319 let chunk = BancoChatChunk {
320 id: id.clone(),
321 object: "chat.completion.chunk".to_string(),
322 created,
323 model: model_name.clone(),
324 choices: vec![ChatChunkChoice {
325 index: 0,
326 delta: ChatDelta {
327 role: None,
328 content: if text.is_empty() { None } else { Some(text.clone()) },
329 },
330 finish_reason: finish.clone(),
331 }],
332 };
333 if let Ok(data) = serde_json::to_string(&chunk) {
334 yield Ok(Event::default().data(data));
335 }
336 }
337
338 yield Ok(Event::default().data("[DONE]"));
340 };
341
342 Sse::new(stream)
343}
344
345pub use super::handlers_tokens::{
347 detokenize_handler, embeddings_handler, get_parameters_handler, tokenize_handler,
348 update_parameters_handler,
349};
350
/// Seconds since the Unix epoch; 0 if the system clock reads before the epoch.
fn now_epoch() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}