Skip to main content

hematite/tools/
vision.rs

1use crate::agent::inference::{ChatMessage, InferenceEngine};
2use base64::prelude::*;
3use serde_json::Value;
4use std::path::Path;
5
6pub fn encode_image_as_data_url(path: &Path) -> Result<String, String> {
7    if !path.exists() {
8        return Err(format!("File not found: {}", path.display()));
9    }
10
11    let data = std::fs::read(path).map_err(|e| format!("Failed to read image: {}", e))?;
12    let b64 = BASE64_STANDARD.encode(data);
13
14    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("png");
15    let mime = match ext.to_lowercase().as_str() {
16        "jpg" | "jpeg" => "image/jpeg",
17        "gif" => "image/gif",
18        "webp" => "image/webp",
19        _ => "image/png",
20    };
21
22    Ok(format!("data:{};base64,{}", mime, b64))
23}
24
25pub async fn vision_analyze(engine: &InferenceEngine, args: &Value) -> Result<String, String> {
26    let path_str = args
27        .get("path")
28        .and_then(|v| v.as_str())
29        .ok_or("Missing parameter: path")?;
30    let prompt = args
31        .get("prompt")
32        .and_then(|v| v.as_str())
33        .ok_or("Missing parameter: prompt")?;
34
35    let path = Path::new(path_str);
36    let url = encode_image_as_data_url(path).map_err(|e| {
37        if e.starts_with("File not found: ") {
38            format!("File not found: {}", path_str)
39        } else {
40            e
41        }
42    })?;
43
44    let messages = vec![
45        ChatMessage::system("You are a vision-capable technical assistant. Analyze the provided image (likely a screenshot, diagram, or UI mockup) and provide a concise technical summary or answer the specific query."),
46        ChatMessage::user_with_image(prompt, &url),
47    ];
48
49    // Use the main engine but with tools disabled for the vision-pass sub-call.
50    let (text, _, _, _) = engine.call_with_tools(&messages, &[], None).await?;
51
52    Ok(text.unwrap_or_else(|| "The vision model returned an empty response.".to_string()))
53}