1use crate::config::Config;
14use crate::conversation::message::{ImagePart, Message, MessageContent, Role};
15use crate::provider::{create_provider, model_name_suggests_vision, LlmProvider};
16use futures::StreamExt;
17
/// Result of an attempt to turn image attachments into text through a
/// secondary vision-language (VL) provider; produced by [`maybe_preprocess`].
#[derive(Clone, Debug)]
pub enum PreprocessOutcome {
    /// No preprocessing was attempted.
    Skipped,
    /// The VL provider produced a non-empty description.
    Replaced {
        /// Description returned by the VL provider, with surrounding
        /// whitespace trimmed.
        text: String,
        /// Key under `config.providers` of the provider that produced `text`.
        vl_key: String,
    },
    /// Preprocessing was attempted but did not yield a usable description.
    Failed {
        /// Human-readable explanation of what went wrong.
        reason: String,
    },
}
38
39pub async fn maybe_preprocess(
49 config: &Config,
50 active_provider: &dyn LlmProvider,
51 caption: &str,
52 images: &[ImagePart],
53) -> PreprocessOutcome {
54 if images.is_empty() {
55 return PreprocessOutcome::Skipped;
56 }
57 if model_name_suggests_vision(active_provider.model_name()) {
58 return PreprocessOutcome::Skipped;
59 }
60 let vl_key = match config.vision_preprocessor_provider.as_deref() {
61 Some(k) if !k.is_empty() => k,
62 _ => return PreprocessOutcome::Skipped,
63 };
64 let vl_cfg = match config.providers.get(vl_key) {
65 Some(c) => c.clone(),
66 None => {
67 return PreprocessOutcome::Failed {
68 reason: format!("VL provider '{vl_key}' not found in config.providers"),
69 };
70 }
71 };
72
73 let vl_provider = match create_provider(&vl_cfg) {
76 Ok(p) => p,
77 Err(e) => {
78 return PreprocessOutcome::Failed {
79 reason: format!("VL provider build failed: {e:#}"),
80 };
81 }
82 };
83
84 let prompt = if caption.trim().is_empty() {
85 "请详细描述这张图片的内容。如果是代码、报错截图或终端输出,请逐字转录文本。"
86 .to_string()
87 } else {
88 format!(
89 "用户的当前请求:{caption}\n\n请详细描述这张图片的内容。如果是代码、\
90 报错截图或终端输出,请逐字转录文本。",
91 )
92 };
93
94 let messages = vec![Message {
98 role: Role::User,
99 content: MessageContent::MultiPart {
100 text: Some(prompt),
101 images: images.to_vec(),
102 },
103 }];
104
105 const IDLE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
115
116 let mut stream = match vl_provider.chat_stream(&messages, None) {
117 Ok(s) => s,
118 Err(e) => {
119 return PreprocessOutcome::Failed {
120 reason: format!("provider '{vl_key}' stream init failed: {e:#}"),
121 };
122 }
123 };
124
125 let mut buf = String::new();
126 loop {
127 let next = match tokio::time::timeout(IDLE_TIMEOUT, stream.next()).await {
128 Ok(n) => n,
129 Err(_) => {
130 return PreprocessOutcome::Failed {
131 reason: format!(
132 "provider '{vl_key}' no progress for {}s",
133 IDLE_TIMEOUT.as_secs(),
134 ),
135 };
136 }
137 };
138 let event = match next {
139 None => break,
140 Some(Ok(ev)) => ev,
141 Some(Err(e)) => {
142 return PreprocessOutcome::Failed {
143 reason: format!("provider '{vl_key}' call error: {e:#}"),
144 };
145 }
146 };
147 match event {
148 crate::stream::StreamEvent::Delta(s) => buf.push_str(&s),
149 crate::stream::StreamEvent::Reasoning(_) => {}
150 crate::stream::StreamEvent::Done { .. } => break,
151 crate::stream::StreamEvent::Error(e) => {
152 return PreprocessOutcome::Failed {
153 reason: format!("provider '{vl_key}' call error: {e}"),
154 };
155 }
156 crate::stream::StreamEvent::Warning(_)
161 | crate::stream::StreamEvent::Usage(_)
162 | crate::stream::StreamEvent::ThinkingBlock { .. }
163 | crate::stream::StreamEvent::ToolCallStart { .. }
164 | crate::stream::StreamEvent::ToolCallDelta(_)
165 | crate::stream::StreamEvent::ToolCallDone(_) => {}
166 }
167 }
168
169 let trimmed = buf.trim();
170 if trimmed.is_empty() {
171 PreprocessOutcome::Failed {
172 reason: format!("provider '{vl_key}' returned empty response"),
173 }
174 } else {
175 PreprocessOutcome::Replaced {
176 text: trimmed.to_string(),
177 vl_key: vl_key.to_string(),
178 }
179 }
180}
181
182#[cfg(test)]
183mod tests {
184 use super::*;
185 use crate::config::provider::ProviderConfig;
186 use std::collections::HashMap;
187
188 fn blank_config() -> Config {
189 Config {
193 default_provider: String::new(),
194 default_workdir: None,
195 providers: HashMap::new(),
196 datalog: Default::default(),
197 auto_update: true,
198 notifications: Default::default(),
199 telemetry: Default::default(),
200 lsp: Default::default(),
201 auto_commit: false,
202 subagent: Default::default(),
203 vision_preprocessor_provider: None,
204 language: None,
205 ui: Default::default(),
206 plugin: Default::default(),
207 }
208 }
209
210 fn sample_image() -> ImagePart {
211 ImagePart {
212 media_type: "image/png".into(),
213 data: "iVBORw0KGgoAAAANSUhEUg==".into(),
214 }
215 }
216
217 struct StubProvider {
220 model: &'static str,
221 }
222 use crate::stream::StreamEvent;
223 use crate::tool::ToolDef;
224 use anyhow::Result;
225 use async_trait::async_trait;
226 use futures::Stream;
227 use std::pin::Pin;
228 #[async_trait]
229 impl LlmProvider for StubProvider {
230 fn chat_stream(
231 &self,
232 _messages: &[crate::conversation::message::Message],
233 _tools: Option<&[ToolDef]>,
234 ) -> Result<Pin<Box<dyn Stream<Item = Result<StreamEvent>> + Send>>> {
235 anyhow::bail!("stub never streams");
236 }
237 fn model_name(&self) -> &str {
238 self.model
239 }
240 }
241
242 #[tokio::test]
243 async fn skipped_when_no_images() {
244 let cfg = blank_config();
245 let provider = StubProvider { model: "deepseek-v4-flash" };
246 let result = maybe_preprocess(&cfg, &provider, "any caption", &[]).await;
247 assert!(matches!(result, PreprocessOutcome::Skipped));
248 }
249
250 #[tokio::test]
251 async fn skipped_when_main_provider_accepts_images() {
252 let cfg = blank_config();
253 let provider = StubProvider { model: "claude-sonnet-4-5" };
254 let result =
255 maybe_preprocess(&cfg, &provider, "describe", &[sample_image()]).await;
256 assert!(matches!(result, PreprocessOutcome::Skipped));
257 }
258
259 #[tokio::test]
260 async fn skipped_when_config_field_unset() {
261 let cfg = blank_config();
262 let provider = StubProvider { model: "deepseek-v4-flash" };
263 let result =
264 maybe_preprocess(&cfg, &provider, "describe", &[sample_image()]).await;
265 assert!(matches!(result, PreprocessOutcome::Skipped));
266 }
267
268 #[tokio::test]
269 async fn skipped_when_config_field_empty_string() {
270 let mut cfg = blank_config();
271 cfg.vision_preprocessor_provider = Some(String::new());
272 let provider = StubProvider { model: "deepseek-v4-flash" };
273 let result =
274 maybe_preprocess(&cfg, &provider, "describe", &[sample_image()]).await;
275 assert!(matches!(result, PreprocessOutcome::Skipped));
276 }
277
278 #[tokio::test]
279 async fn failed_when_configured_key_missing_from_providers() {
280 let mut cfg = blank_config();
281 cfg.vision_preprocessor_provider = Some("AtomGit-NoSuchModel".into());
282 let provider = StubProvider { model: "deepseek-v4-flash" };
283 let result =
284 maybe_preprocess(&cfg, &provider, "describe", &[sample_image()]).await;
285 match result {
286 PreprocessOutcome::Failed { reason } => {
287 assert!(
288 reason.contains("AtomGit-NoSuchModel") && reason.contains("not found"),
289 "expected 'not found' for missing key, got: {reason}",
290 );
291 }
292 other => panic!("expected Failed, got {other:?}"),
293 }
294 }
295
296 use wiremock::matchers::{method, path};
297 use wiremock::{Mock, MockServer, ResponseTemplate};
298
299 fn sse_one_token(text: &str) -> String {
303 let chunk = serde_json::json!({
304 "choices": [{
305 "delta": { "content": text },
306 "finish_reason": null,
307 }],
308 });
309 let done = serde_json::json!({
310 "choices": [{
311 "delta": {},
312 "finish_reason": "stop",
313 }],
314 });
315 format!("data: {}\n\ndata: {}\n\ndata: [DONE]\n\n", chunk, done)
316 }
317
318 fn vl_provider_cfg(base_url: &str) -> ProviderConfig {
319 ProviderConfig {
320 provider_type: "openai".into(),
321 api_key: Some("sk-test".into()),
322 model: "Qwen/Qwen3-VL-32B-Instruct".into(),
323 base_url: Some(base_url.to_string()),
324 system_prompt: None,
325 user_agent: None,
326 context_window: 8000,
327 max_tokens: None,
328 thinking_type: None,
329 thinking_keep: None,
330 reasoning_history: None,
331 thinking_enabled: None,
332 thinking_budget: None,
333 skip_tls_verify: false,
334 ephemeral: false,
335 }
336 }
337
338 #[tokio::test]
339 async fn replaced_when_vl_returns_text() {
340 let server = MockServer::start().await;
341 Mock::given(method("POST"))
342 .and(path("/chat/completions"))
343 .respond_with(
344 ResponseTemplate::new(200)
345 .insert_header("content-type", "text/event-stream")
346 .set_body_string(sse_one_token(
347 "Python stack trace showing ZeroDivisionError on line 42",
348 )),
349 )
350 .expect(1)
351 .mount(&server)
352 .await;
353
354 let mut cfg = blank_config();
355 cfg.providers.insert(
356 "vl".into(),
357 vl_provider_cfg(&server.uri()),
358 );
359 cfg.vision_preprocessor_provider = Some("vl".into());
360
361 let provider = StubProvider { model: "deepseek-v4-flash" };
362 let result =
363 maybe_preprocess(&cfg, &provider, "explain this", &[sample_image()]).await;
364
365 match result {
366 PreprocessOutcome::Replaced { text, vl_key } => {
367 assert_eq!(
368 text,
369 "Python stack trace showing ZeroDivisionError on line 42"
370 );
371 assert_eq!(vl_key, "vl", "Replaced must carry the configured key");
372 }
373 other => panic!("expected Replaced, got {other:?}"),
374 }
375 }
376
377 #[tokio::test]
378 async fn failed_when_vl_returns_500() {
379 let server = MockServer::start().await;
380 Mock::given(method("POST"))
381 .and(path("/chat/completions"))
382 .respond_with(ResponseTemplate::new(500).set_body_string("upstream error"))
383 .mount(&server)
386 .await;
387
388 let mut cfg = blank_config();
389 cfg.providers.insert(
390 "vl".into(),
391 vl_provider_cfg(&format!("{}/", server.uri())),
392 );
393 cfg.vision_preprocessor_provider = Some("vl".into());
394
395 let provider = StubProvider { model: "deepseek-v4-flash" };
396 let result =
397 maybe_preprocess(&cfg, &provider, "x", &[sample_image()]).await;
398
399 match result {
400 PreprocessOutcome::Failed { reason } => {
401 assert!(
402 reason.contains("VL call error") || reason.contains("500"),
403 "expected error reason mentioning failure, got: {reason}",
404 );
405 }
406 other => panic!("expected Failed, got {other:?}"),
407 }
408 }
409
410 #[tokio::test]
411 async fn failed_when_vl_returns_empty_string() {
412 let server = MockServer::start().await;
413 Mock::given(method("POST"))
414 .and(path("/chat/completions"))
415 .respond_with(
416 ResponseTemplate::new(200)
417 .insert_header("content-type", "text/event-stream")
418 .set_body_string(sse_one_token("")), )
420 .mount(&server)
421 .await;
422
423 let mut cfg = blank_config();
424 cfg.providers.insert(
425 "vl".into(),
426 vl_provider_cfg(&format!("{}/", server.uri())),
427 );
428 cfg.vision_preprocessor_provider = Some("vl".into());
429
430 let provider = StubProvider { model: "deepseek-v4-flash" };
431 let result =
432 maybe_preprocess(&cfg, &provider, "x", &[sample_image()]).await;
433
434 match result {
435 PreprocessOutcome::Failed { reason } => {
436 assert!(
437 reason.contains("empty"),
438 "expected 'empty' in reason, got: {reason}",
439 );
440 }
441 other => panic!("expected Failed for empty response, got {other:?}"),
442 }
443 }
444
445 use wiremock::Match;
447 struct BodyContains(String);
448 impl Match for BodyContains {
449 fn matches(&self, req: &wiremock::Request) -> bool {
450 String::from_utf8_lossy(&req.body).contains(&self.0)
451 }
452 }
453
454 struct BodyNotContains(String);
458 impl Match for BodyNotContains {
459 fn matches(&self, request: &wiremock::Request) -> bool {
460 !String::from_utf8_lossy(&request.body).contains(&self.0)
461 }
462 }
463
464 #[tokio::test]
465 async fn caption_is_included_in_vl_prompt() {
466 let server = MockServer::start().await;
467 Mock::given(method("POST"))
468 .and(path("/chat/completions"))
469 .and(BodyContains("用户的当前请求:解释这段代码".into()))
470 .respond_with(
471 ResponseTemplate::new(200)
472 .insert_header("content-type", "text/event-stream")
473 .set_body_string(sse_one_token("ok")),
474 )
475 .expect(1)
476 .mount(&server)
477 .await;
478
479 let mut cfg = blank_config();
480 cfg.providers.insert(
481 "vl".into(),
482 vl_provider_cfg(&format!("{}/", server.uri())),
483 );
484 cfg.vision_preprocessor_provider = Some("vl".into());
485
486 let provider = StubProvider { model: "deepseek-v4-flash" };
487 let result = maybe_preprocess(
488 &cfg,
489 &provider,
490 "解释这段代码",
491 &[sample_image()],
492 )
493 .await;
494
495 assert!(matches!(result, PreprocessOutcome::Replaced { .. }));
498 }
499
500 #[tokio::test]
501 async fn empty_caption_uses_pure_describe_prompt() {
502 let server = MockServer::start().await;
503 Mock::given(method("POST"))
505 .and(path("/chat/completions"))
506 .and(BodyContains("请详细描述这张图片的内容".into()))
507 .and(BodyNotContains("用户的当前请求:".into()))
508 .respond_with(
509 ResponseTemplate::new(200)
510 .insert_header("content-type", "text/event-stream")
511 .set_body_string(sse_one_token("ok")),
512 )
513 .expect(1)
514 .mount(&server)
515 .await;
516
517 let mut cfg = blank_config();
518 cfg.providers.insert(
519 "vl".into(),
520 vl_provider_cfg(&format!("{}/", server.uri())),
521 );
522 cfg.vision_preprocessor_provider = Some("vl".into());
523
524 let provider = StubProvider { model: "deepseek-v4-flash" };
525 let result = maybe_preprocess(&cfg, &provider, " ", &[sample_image()]).await;
526
527 assert!(matches!(result, PreprocessOutcome::Replaced { .. }));
528 }
529}