collet 0.1.1

Relentless agentic coding orchestrator with zero-drop agent loops
Documentation
//! Image processing fallback chain for vision and non-vision models.
//!
//! When a user attaches images, the chain tries these strategies in order:
//! 1. **Direct Vision** — model supports multimodal input → send images directly.
//! 2. **MCP Caption** — an MCP server exposes a vision tool → get a text caption.
//! 3. **Local OCR** — `tesseract` is installed locally → extract text via OCR.
//! 4. **Metadata Only** — none of the above → pass image metadata + guidance.

use std::sync::Arc;

use tracing::{info, warn};

use super::content::{Content, ImageData};
use super::model_profile::supports_vision_for;
use crate::mcp::manager::McpManager;

/// How the images were processed.
///
/// Exactly one variant is reported per call to [`process_images`], naming the
/// step of the fallback chain that produced the final content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ImageMethod {
    /// Model received images directly via multipart content.
    /// Chosen when `supports_vision_for(model)` returns true.
    DirectVision,
    /// An MCP vision server produced a text caption.
    /// `server` is the name of the MCP server whose tool was called.
    McpCaption { server: String },
    /// Local `tesseract` OCR extracted text.
    LocalOcr,
    /// Only image metadata was forwarded (no processing available).
    /// This is the last step of the chain, used when every other step was
    /// unavailable or produced nothing.
    MetadataOnly,
}

/// Result of the image processing fallback chain.
///
/// Returned by [`process_images`]; bundles the content to send together with
/// information the UI can show about how the images were handled.
#[derive(Debug, Clone)]
pub struct ImageProcessResult {
    /// Content to send to the LLM (multipart or text).
    /// Multipart is only produced for direct vision; every other method
    /// yields text with the user's prompt appended after the image-derived
    /// text.
    pub content: Content,
    /// Human-readable notice for the TUI.
    pub notice: String,
    /// Which method was used.
    pub method: ImageMethod,
    /// Optional install hint (e.g., tesseract install command) when no processor is available.
    /// Only set for [`ImageMethod::MetadataOnly`]; `None` for all other methods.
    pub install_hint: Option<String>,
}

/// MCP tool name patterns that indicate vision / image-captioning capability.
///
/// `find_vision_mcp` lowercases each tool name and tests whether it
/// *contains* any of these entries, so all patterns must be lowercase.
/// Because this is a plain substring test, a short pattern such as
/// `"vision"` also matches unrelated names that merely embed it
/// (e.g. `revision`).
const VISION_TOOL_PATTERNS: &[&str] = &[
    "describe_image",
    "caption_image",
    "analyze_image",
    "image_to_text",
    "ocr_image",
    "vision",
];

/// Run the image processing fallback chain.
///
/// The caller should only invoke this when `images` is non-empty.
pub async fn process_images(
    text: String,
    images: Vec<ImageData>,
    model: &str,
    mcp_manager: Option<&Arc<McpManager>>,
    _working_dir: &str,
) -> ImageProcessResult {
    // ── Step 1: Direct Vision ───────────────────────────────────────────
    if supports_vision_for(model) {
        info!(model, "Vision model — sending images directly");
        return ImageProcessResult {
            content: Content::multipart(text, images),
            notice: "Image sent directly to vision model".to_string(),
            method: ImageMethod::DirectVision,
            install_hint: None,
        };
    }

    // ── Step 2: MCP Vision Server ───────────────────────────────────────
    if let Some(manager) = mcp_manager
        && let Some((prefixed_tool, server_name)) = find_vision_mcp(manager)
    {
        match try_mcp_caption(manager, &prefixed_tool, &images).await {
            Ok(caption) => {
                info!(server = %server_name, "MCP vision caption succeeded");
                let augmented = format!(
                    "[Image description from MCP server '{server_name}']\n{caption}\n\n{text}"
                );
                return ImageProcessResult {
                    content: Content::text(augmented),
                    notice: format!("Image analyzed via MCP server '{server_name}'"),
                    method: ImageMethod::McpCaption {
                        server: server_name,
                    },
                    install_hint: None,
                };
            }
            Err(e) => {
                warn!(server = %server_name, error = %e, "MCP vision caption failed, falling through");
            }
        }
    }

    // ── Step 3: Local Tesseract OCR ─────────────────────────────────────
    if let Some(ocr_text) = try_tesseract_ocr(&images).await {
        info!("Tesseract OCR succeeded");
        let augmented = format!("[OCR text extracted from image]\n{ocr_text}\n\n{text}");
        return ImageProcessResult {
            content: Content::text(augmented),
            notice: "Image text extracted via Tesseract OCR".to_string(),
            method: ImageMethod::LocalOcr,
            install_hint: None,
        };
    }

    // ── Step 4: Metadata Only ───────────────────────────────────────────
    let meta = build_metadata(&images);
    let augmented = format!("{meta}\n\n{text}");
    ImageProcessResult {
        content: Content::text(augmented),
        notice: format!(
            "Current model '{}' does not support images. \
             Install tesseract or connect a vision MCP server for image processing.",
            model
        ),
        method: ImageMethod::MetadataOnly,
        install_hint: Some(tesseract_install_cmd()),
    }
}

/// Pick the tesseract install command that fits the current platform.
///
/// macOS and Windows are decided at compile time via `cfg!`. On any other
/// target the first package manager binary found under `/usr/bin` (checked in
/// the order apt, dnf, pacman) selects the command; when none is present the
/// apt command is returned as the default.
fn tesseract_install_cmd() -> String {
    const APT_INSTALL: &str = "sudo apt install -y tesseract-ocr";
    // (package manager binary, install command), in probing order.
    const LINUX_MANAGERS: [(&str, &str); 3] = [
        ("/usr/bin/apt", APT_INSTALL),
        ("/usr/bin/dnf", "sudo dnf install -y tesseract"),
        ("/usr/bin/pacman", "sudo pacman -S tesseract"),
    ];

    let command = if cfg!(target_os = "macos") {
        "brew install tesseract"
    } else if cfg!(target_os = "windows") {
        "choco install tesseract"
    } else {
        // Linux — try to detect package manager
        LINUX_MANAGERS
            .iter()
            .find(|&&(binary, _)| std::path::Path::new(binary).exists())
            .map_or(APT_INSTALL, |&(_, install)| install)
    };
    command.to_string()
}

// ── Helpers ─────────────────────────────────────────────────────────────────

/// Locate the first MCP tool whose name looks vision-related.
///
/// Servers and their tools are scanned in the order `server_meta()` yields
/// them. A tool matches when its lowercased name contains any entry of
/// `VISION_TOOL_PATTERNS`.
///
/// Returns `(prefixed_tool_name, server_name)` for the first match, or
/// `None` when no server exposes such a tool.
fn find_vision_mcp(manager: &McpManager) -> Option<(String, String)> {
    manager
        .server_meta()
        .into_iter()
        .find_map(|(server_name, meta)| {
            meta.tool_names
                .iter()
                .find(|candidate| {
                    let normalized = candidate.to_lowercase();
                    VISION_TOOL_PATTERNS
                        .iter()
                        .any(|pattern| normalized.contains(pattern))
                })
                .map(|tool| (tool.clone(), server_name.clone()))
        })
}

/// Ask an MCP vision tool to describe the first attached image.
///
/// Only `images[0]` is sent: its `to_data_url()` value is passed as the
/// `"image"` field of a JSON object, serialized to a string, and handed to
/// `call_tool` under the name `prefixed_tool`.
///
/// Returns the tool's output on success. Fails with `"no images"` when
/// `images` is empty, or with the stringified tool error otherwise.
async fn try_mcp_caption(
    manager: &McpManager,
    prefixed_tool: &str,
    images: &[ImageData],
) -> Result<String, String> {
    let Some(first_image) = images.first() else {
        return Err("no images".to_string());
    };

    let data_url = first_image.to_data_url();
    let payload = serde_json::json!({ "image": data_url }).to_string();

    match manager.call_tool(prefixed_tool, &payload).await {
        Ok(caption) => Ok(caption),
        Err(err) => Err(err.to_string()),
    }
}

/// Try extracting text from images using local `tesseract`.
///
/// Each image is written to its own temporary file, passed to
/// `tesseract <file> stdout`, and the file is removed again afterwards.
/// Images that cannot be written or that tesseract fails on are skipped.
/// The trimmed, non-empty outputs are joined with a `---` separator line.
///
/// Returns `None` when `tesseract` cannot be executed or when no image
/// produced any text.
async fn try_tesseract_ocr(images: &[ImageData]) -> Option<String> {
    // Per-process sequence number so that concurrent calls never share a
    // temp file name.
    static OCR_SEQ: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    // Check if tesseract is available by running the binary itself. Probing
    // with `which` only works on Unix-like systems and would make OCR
    // silently unavailable on Windows even when tesseract is installed.
    let probe = tokio::process::Command::new("tesseract")
        .arg("--version")
        .output()
        .await
        .ok()?;
    if !probe.status.success() {
        return None;
    }

    let pid = std::process::id();
    let mut texts = Vec::new();
    for img in images {
        // The pid separates concurrent processes, the sequence number
        // separates concurrent calls inside this process. Joining onto the
        // `PathBuf` keeps non-UTF-8 temp directories intact.
        let seq = OCR_SEQ.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        let tmp_path = std::env::temp_dir().join(format!("_collet_ocr_{pid}_{seq}.png"));

        // Write image to temp file.
        if tokio::fs::write(&tmp_path, &img.bytes).await.is_err() {
            // Best effort: a failed write may still have left a partial file.
            let _ = tokio::fs::remove_file(&tmp_path).await;
            continue;
        }

        // Run tesseract; `stdout` as the output base makes it print the text.
        let output = tokio::process::Command::new("tesseract")
            .arg(&tmp_path)
            .arg("stdout")
            .output()
            .await;

        // Clean up regardless of whether tesseract succeeded.
        let _ = tokio::fs::remove_file(&tmp_path).await;

        let Ok(out) = output else {
            continue;
        };
        if !out.status.success() {
            continue;
        }

        let text = String::from_utf8_lossy(&out.stdout).trim().to_string();
        if !text.is_empty() {
            texts.push(text);
        }
    }

    if texts.is_empty() {
        None
    } else {
        Some(texts.join("\n---\n"))
    }
}

/// Describe images that could not be processed, one line per image.
///
/// Each line carries the 1-based position, the image's `mime_type`, and its
/// size in whole kibibytes (byte length divided by 1024, rounded down).
/// Lines are joined with `\n`; an empty slice yields an empty string.
fn build_metadata(images: &[ImageData]) -> String {
    images
        .iter()
        .enumerate()
        .map(|(index, image)| {
            let size_kb = image.bytes.len() / 1024;
            format!(
                "[Image {}: {}, {}KB — cannot be displayed; model does not support vision]",
                index + 1,
                image.mime_type,
                size_kb
            )
        })
        .collect::<Vec<_>>()
        .join("\n")
}

// Unit tests for the fallback chain and its helpers. Fixtures are built from
// raw byte vectors that start with a PNG or JPEG file signature and are then
// zero-padded to a chosen length.
#[cfg(test)]
mod tests {
    use super::*;

    // A single image just over 1 KiB must be reported with its MIME type and
    // a size of "1KB".
    #[test]
    fn test_build_metadata_single() {
        // First four bytes of the PNG signature; the assertion below expects
        // `ImageData::new` to report `image/png` for them.
        let mut bytes = vec![0x89, 0x50, 0x4E, 0x47];
        // 1028 bytes → 1028 / 1024 == 1, so the size is rendered as "1KB".
        bytes.resize(1028, 0);
        let img = ImageData::new(bytes);
        let meta = build_metadata(&[img]);
        assert!(meta.contains("image/png"));
        assert!(meta.contains("1KB"));
    }

    // Two images must produce two numbered entries, each with its own MIME
    // type.
    #[test]
    fn test_build_metadata_multiple() {
        let mut bytes1 = vec![0x89, 0x50, 0x4E, 0x47];
        bytes1.resize(2052, 0);
        let img1 = ImageData::new(bytes1);
        // JPEG start-of-image marker followed by an APP0 marker.
        let mut bytes2 = vec![0xFF, 0xD8, 0xFF, 0xE0];
        bytes2.resize(4100, 0);
        let img2 = ImageData::new(bytes2);
        let meta = build_metadata(&[img1, img2]);
        assert!(meta.contains("Image 1"));
        assert!(meta.contains("Image 2"));
        assert!(meta.contains("image/png"));
        assert!(meta.contains("image/jpeg"));
    }

    // A manager created with `McpManager::empty()` must yield no vision tool.
    #[test]
    fn test_find_vision_mcp_empty() {
        let manager = McpManager::empty();
        assert!(find_vision_mcp(&manager).is_none());
    }

    // The test expects "gpt-4o" to be treated as vision-capable, so the
    // images are passed through directly and no install hint is produced.
    #[tokio::test]
    async fn test_process_images_direct_vision() {
        let img = ImageData::new(vec![0x89, 0x50, 0x4E, 0x47]);
        let result = process_images(
            "describe this".to_string(),
            vec![img],
            "gpt-4o",
            None,
            "/tmp",
        )
        .await;
        assert_eq!(result.method, ImageMethod::DirectVision);
        assert!(result.content.has_images());
        assert!(result.install_hint.is_none());
    }

    // The test expects "deepseek-chat" not to be vision-capable; with no MCP
    // manager supplied the chain can only end in OCR or metadata, depending
    // on whether tesseract is installed on the machine running the test.
    #[tokio::test]
    async fn test_process_images_metadata_fallback() {
        let mut bytes = vec![0x89, 0x50, 0x4E, 0x47];
        bytes.resize(516, 0);
        let img = ImageData::new(bytes);
        let result = process_images(
            "describe this".to_string(),
            vec![img],
            "deepseek-chat",
            None,
            "/tmp",
        )
        .await;
        // Without tesseract in test env, should fall to MetadataOnly or LocalOcr
        assert!(
            result.method == ImageMethod::MetadataOnly || result.method == ImageMethod::LocalOcr
        );
        assert!(!result.content.has_images());
        // MetadataOnly should have install hint, LocalOcr should not
        if result.method == ImageMethod::MetadataOnly {
            assert!(result.install_hint.is_some());
        }
    }

    // Whatever the platform, the hint must be a non-empty command that
    // mentions tesseract.
    #[test]
    fn test_tesseract_install_cmd() {
        let cmd = tesseract_install_cmd();
        assert!(!cmd.is_empty());
        assert!(cmd.contains("tesseract"));
    }
}