use std::sync::Arc;
use tracing::{info, warn};
use super::content::{Content, ImageData};
use super::model_profile::supports_vision_for;
use crate::mcp::manager::McpManager;
/// Strategy that was used to make attached images usable by the target model.
///
/// Reported back to the caller in [`ImageProcessResult::method`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ImageMethod {
/// The model supports vision, so the images were forwarded unchanged.
DirectVision,
/// A tool on the named MCP server returned a text description of the image.
McpCaption { server: String },
/// Text was extracted locally by running the `tesseract` command.
LocalOcr,
/// No strategy succeeded; only a per-image summary (MIME type and size) was included.
MetadataOnly,
}
/// Outcome of [`process_images`]: the content to send plus details about how
/// the attached images were handled.
#[derive(Debug, Clone)]
pub struct ImageProcessResult {
/// Content to send to the model: text plus images for direct vision,
/// text-only (with a caption, OCR text or metadata prepended) otherwise.
pub content: Content,
/// Human-readable summary of how the images were handled.
pub notice: String,
/// Strategy that produced `content`.
pub method: ImageMethod,
/// Suggested shell command for installing Tesseract; `process_images` sets
/// it only for the [`ImageMethod::MetadataOnly`] fallback.
pub install_hint: Option<String>,
}
/// Lowercase name fragments that `find_vision_mcp` looks for in MCP tool names
/// to recognise a tool that can describe or read an image.
const VISION_TOOL_PATTERNS: &[&str] = &[
"describe_image",
"caption_image",
"analyze_image",
"image_to_text",
"ocr_image",
"vision",
];
pub async fn process_images(
text: String,
images: Vec<ImageData>,
model: &str,
mcp_manager: Option<&Arc<McpManager>>,
_working_dir: &str,
) -> ImageProcessResult {
if supports_vision_for(model) {
info!(model, "Vision model — sending images directly");
return ImageProcessResult {
content: Content::multipart(text, images),
notice: "Image sent directly to vision model".to_string(),
method: ImageMethod::DirectVision,
install_hint: None,
};
}
if let Some(manager) = mcp_manager
&& let Some((prefixed_tool, server_name)) = find_vision_mcp(manager)
{
match try_mcp_caption(manager, &prefixed_tool, &images).await {
Ok(caption) => {
info!(server = %server_name, "MCP vision caption succeeded");
let augmented = format!(
"[Image description from MCP server '{server_name}']\n{caption}\n\n{text}"
);
return ImageProcessResult {
content: Content::text(augmented),
notice: format!("Image analyzed via MCP server '{server_name}'"),
method: ImageMethod::McpCaption {
server: server_name,
},
install_hint: None,
};
}
Err(e) => {
warn!(server = %server_name, error = %e, "MCP vision caption failed, falling through");
}
}
}
if let Some(ocr_text) = try_tesseract_ocr(&images).await {
info!("Tesseract OCR succeeded");
let augmented = format!("[OCR text extracted from image]\n{ocr_text}\n\n{text}");
return ImageProcessResult {
content: Content::text(augmented),
notice: "Image text extracted via Tesseract OCR".to_string(),
method: ImageMethod::LocalOcr,
install_hint: None,
};
}
let meta = build_metadata(&images);
let augmented = format!("{meta}\n\n{text}");
ImageProcessResult {
content: Content::text(augmented),
notice: format!(
"Current model '{}' does not support images. \
Install tesseract or connect a vision MCP server for image processing.",
model
),
method: ImageMethod::MetadataOnly,
install_hint: Some(tesseract_install_cmd()),
}
}
/// Returns a shell command that installs Tesseract on the current platform.
///
/// macOS and Windows map to Homebrew and Chocolatey. On every other target the
/// first package manager found under `/usr/bin` (apt, then dnf, then pacman)
/// decides the command, with the apt command as the default when none is found.
fn tesseract_install_cmd() -> String {
    const APT_INSTALL: &str = "sudo apt install -y tesseract-ocr";

    let command = if cfg!(target_os = "macos") {
        "brew install tesseract"
    } else if cfg!(target_os = "windows") {
        "choco install tesseract"
    } else {
        // Probed in priority order; the first existing binary wins.
        let candidates = [
            ("/usr/bin/apt", APT_INSTALL),
            ("/usr/bin/dnf", "sudo dnf install -y tesseract"),
            ("/usr/bin/pacman", "sudo pacman -S tesseract"),
        ];
        candidates
            .iter()
            .find(|(binary, _)| std::path::Path::new(binary).exists())
            .map_or(APT_INSTALL, |(_, install)| *install)
    };
    command.to_string()
}
fn find_vision_mcp(manager: &McpManager) -> Option<(String, String)> {
for (server_name, meta) in manager.server_meta() {
for tool_name in &meta.tool_names {
let lower = tool_name.to_lowercase();
if VISION_TOOL_PATTERNS.iter().any(|p| lower.contains(p)) {
return Some((tool_name.clone(), server_name.clone()));
}
}
}
None
}
/// Asks the MCP tool `prefixed_tool` to describe every image in `images`.
///
/// Each image is sent as a data URL under the `"image"` argument key, one tool
/// call per image. With a single image the tool's answer is returned as-is;
/// with several, the answers are joined with `\n---\n` (the same separator the
/// OCR path uses) so that no attachment is silently dropped.
///
/// # Errors
///
/// Returns `Err("no images")` for an empty slice, or the stringified tool
/// error of the first call that fails. Failing fast lets the caller fall back
/// to a strategy that can cover all images.
async fn try_mcp_caption(
    manager: &McpManager,
    prefixed_tool: &str,
    images: &[ImageData],
) -> Result<String, String> {
    if images.is_empty() {
        return Err("no images".to_string());
    }

    let mut captions = Vec::with_capacity(images.len());
    for img in images {
        let args = serde_json::json!({
            "image": img.to_data_url(),
        });
        let caption = manager
            .call_tool(prefixed_tool, &args.to_string())
            .await
            .map_err(|e| e.to_string())?;
        captions.push(caption);
    }
    Ok(captions.join("\n---\n"))
}
/// Runs the `tesseract` command-line tool over each image and returns the
/// recognised text.
///
/// Every image is written to its own temporary file, passed to
/// `tesseract <file> stdout`, and the file is removed again. Non-empty results
/// are joined with `\n---\n`.
///
/// Returns `None` when `tesseract` cannot be started, or when no image yields
/// any text. Images that cannot be written or recognised are skipped.
async fn try_tesseract_ocr(images: &[ImageData]) -> Option<String> {
    // Distinguishes temp files of concurrent calls within this process.
    static NEXT_FILE_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    // Probe by launching the binary itself rather than `which`, which is not
    // available on Windows or on some minimal Unix installations.
    let probe = tokio::process::Command::new("tesseract")
        .arg("--version")
        .output()
        .await
        .ok()?;
    if !probe.status.success() {
        return None;
    }

    let tmp_dir = std::env::temp_dir();
    let pid = std::process::id();
    let mut texts = Vec::new();
    for img in images {
        // Relaxed is enough: the counter only has to hand out distinct values.
        let file_id = NEXT_FILE_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        // pid + counter keeps parallel invocations (in this or another process)
        // from overwriting or deleting each other's files.
        let tmp_path = tmp_dir.join(format!("_collet_ocr_{pid}_{file_id}.png"));

        if tokio::fs::write(&tmp_path, &img.bytes).await.is_err() {
            // A failed write may still have created a partial file.
            let _ = tokio::fs::remove_file(&tmp_path).await;
            continue;
        }

        let output = tokio::process::Command::new("tesseract")
            .arg(&tmp_path)
            .arg("stdout")
            .output()
            .await;
        // Best-effort cleanup; a leftover temp file must not fail the OCR.
        let _ = tokio::fs::remove_file(&tmp_path).await;

        let Ok(out) = output else { continue };
        if !out.status.success() {
            continue;
        }
        let text = String::from_utf8_lossy(&out.stdout).trim().to_string();
        if !text.is_empty() {
            texts.push(text);
        }
    }

    if texts.is_empty() {
        None
    } else {
        Some(texts.join("\n---\n"))
    }
}
/// Builds the text placeholder used when the images cannot be processed.
///
/// Produces one line per image with its 1-based position, MIME type and size
/// in whole kibibytes (rounded down), joined with newlines. An empty slice
/// yields an empty string.
fn build_metadata(images: &[ImageData]) -> String {
    images
        .iter()
        .enumerate()
        .map(|(index, image)| {
            let number = index + 1;
            let mime = &image.mime_type;
            let size_kb = image.bytes.len() / 1024;
            format!(
                "[Image {number}: {mime}, {size_kb}KB — cannot be displayed; model does not support vision]"
            )
        })
        .collect::<Vec<_>>()
        .join("\n")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Leading bytes of a PNG file.
    const PNG_MAGIC: [u8; 4] = [0x89, 0x50, 0x4E, 0x47];
    /// Leading bytes of a JPEG file.
    const JPEG_MAGIC: [u8; 4] = [0xFF, 0xD8, 0xFF, 0xE0];

    /// Builds an image whose payload starts with `magic` and is zero-padded to
    /// `total_len` bytes.
    // NOTE(review): the MIME assertions below presume `ImageData::new` derives
    // the type from these leading bytes — confirm against `ImageData`.
    fn image_of(magic: [u8; 4], total_len: usize) -> ImageData {
        let mut bytes = magic.to_vec();
        bytes.resize(total_len, 0);
        ImageData::new(bytes)
    }

    /// Runs `process_images` with a fixed prompt, no MCP manager and `/tmp` as
    /// working directory.
    async fn run(image: ImageData, model: &str) -> ImageProcessResult {
        process_images("describe this".to_string(), vec![image], model, None, "/tmp").await
    }

    #[test]
    fn test_build_metadata_single() {
        let summary = build_metadata(&[image_of(PNG_MAGIC, 1028)]);
        assert!(summary.contains("image/png"));
        assert!(summary.contains("1KB"));
    }

    #[test]
    fn test_build_metadata_multiple() {
        let png = image_of(PNG_MAGIC, 2052);
        let jpeg = image_of(JPEG_MAGIC, 4100);
        let summary = build_metadata(&[png, jpeg]);
        for expected in ["Image 1", "Image 2", "image/png", "image/jpeg"] {
            assert!(summary.contains(expected));
        }
    }

    #[test]
    fn test_find_vision_mcp_empty() {
        let manager = McpManager::empty();
        assert!(find_vision_mcp(&manager).is_none());
    }

    #[tokio::test]
    async fn test_process_images_direct_vision() {
        let result = run(image_of(PNG_MAGIC, PNG_MAGIC.len()), "gpt-4o").await;
        assert_eq!(result.method, ImageMethod::DirectVision);
        assert!(result.content.has_images());
        assert!(result.install_hint.is_none());
    }

    #[tokio::test]
    async fn test_process_images_metadata_fallback() {
        let result = run(image_of(PNG_MAGIC, 516), "deepseek-chat").await;
        // Which fallback is taken depends on whether Tesseract is installed on
        // the machine running the tests, so both outcomes are accepted.
        assert!(matches!(
            result.method,
            ImageMethod::MetadataOnly | ImageMethod::LocalOcr
        ));
        assert!(!result.content.has_images());
        if result.method == ImageMethod::MetadataOnly {
            assert!(result.install_hint.is_some());
        }
    }

    #[test]
    fn test_tesseract_install_cmd() {
        let cmd = tesseract_install_cmd();
        assert!(!cmd.is_empty());
        assert!(cmd.contains("tesseract"));
    }
}