use std::path::Path;
use base64::Engine;
use serde_json::{json, Value};
use tracing::warn;
use crate::config::VisionConfig;
use crate::traits::{AttachmentProvenance, MessageAttachment};
use crate::agent::turn_render::RenderMode;
/// System-style hint text stating that the user attached image(s) that were saved to disk but
/// cannot be viewed, and instructing the model to acknowledge receipt and explain that.
///
/// NOTE(review): this constant is not referenced anywhere in the visible part of this file;
/// presumably callers append it when a build result reports `vision_skipped` — confirm.
pub const VISION_SKIPPED_SYSTEM_HINT: &str = "[SYSTEM] User attached image(s) saved to disk; vision is disabled, unsupported, or the file could not be read — acknowledge receipt and explain you cannot view the image directly.";
/// Reads the image file at `path` and returns its bytes base64-encoded with the
/// `STANDARD` engine.
///
/// Checks run cheapest-first so that rejected inputs cost as little as possible:
/// 1. `mime` must start with `image/` (no I/O is performed otherwise);
/// 2. the size reported by the file's metadata must not exceed `max_bytes`;
/// 3. the file is read through a reader capped at `max_bytes + 1` bytes, so a file that
///    grows after the metadata check can never pull more than the cap into memory.
///
/// # Errors
///
/// * [`EncodeImageError::UnsupportedMime`] when `mime` does not start with `image/`.
/// * [`EncodeImageError::Io`] when the file cannot be opened, inspected, or read.
/// * [`EncodeImageError::TooLarge`] when the file exceeds `max_bytes`. If the overflow is
///   only detected during the capped read, `size_bytes` is the number of bytes read
///   (`max_bytes + 1`), i.e. a lower bound on the real size.
pub fn encode_image_attachment(
    path: &Path,
    mime: &str,
    max_bytes: u64,
) -> Result<String, EncodeImageError> {
    // Validate the MIME type before touching the filesystem.
    if !mime.starts_with("image/") {
        return Err(EncodeImageError::UnsupportedMime(mime.to_string()));
    }

    // Open once and stat the handle so the size check and the read refer to the same file.
    let file = std::fs::File::open(path).map_err(EncodeImageError::Io)?;
    let reported_len = file.metadata().map_err(EncodeImageError::Io)?.len();
    if reported_len > max_bytes {
        return Err(EncodeImageError::TooLarge {
            size_bytes: reported_len,
            max_bytes,
        });
    }

    // Read at most one byte past the cap: enough to detect growth, never unbounded.
    let mut bytes = Vec::with_capacity(usize::try_from(reported_len).unwrap_or(0));
    let mut capped = std::io::Read::take(file, max_bytes.saturating_add(1));
    std::io::Read::read_to_end(&mut capped, &mut bytes).map_err(EncodeImageError::Io)?;

    // `usize` -> `u64` is a widening conversion on every supported target.
    let read_len = bytes.len() as u64;
    if read_len > max_bytes {
        return Err(EncodeImageError::TooLarge {
            size_bytes: read_len,
            max_bytes,
        });
    }

    Ok(base64::engine::general_purpose::STANDARD.encode(bytes))
}
/// Reasons an image attachment could not be turned into base64 data.
#[derive(Debug)]
pub enum EncodeImageError {
    /// The file could not be inspected or read.
    Io(std::io::Error),
    /// The file is larger than the configured cap.
    TooLarge {
        /// Observed size of the file in bytes.
        size_bytes: u64,
        /// Maximum size that was allowed, in bytes.
        max_bytes: u64,
    },
    /// The supplied MIME type is not an `image/*` type; carries the offending value.
    UnsupportedMime(String),
}

impl std::fmt::Display for EncodeImageError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Io(e) => write!(f, "failed to read image: {e}"),
            Self::TooLarge {
                size_bytes,
                max_bytes,
            } => write!(
                f,
                "image size {size_bytes} bytes exceeds vision cap {max_bytes} bytes"
            ),
            Self::UnsupportedMime(m) => write!(f, "unsupported image mime: {m}"),
        }
    }
}

// Implementing `Error` lets callers use `?` into `Box<dyn Error>` and walk the cause chain.
impl std::error::Error for EncodeImageError {
    /// Exposes the underlying I/O error for the `Io` variant; other variants have no cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::Io(err) => Some(err),
            Self::TooLarge { .. } | Self::UnsupportedMime(_) => None,
        }
    }
}
/// Outcome of assembling message content from text plus attachments.
pub struct MultimodalBuildResult {
/// The message content: in this file it is built either as a `Value::String` holding the
/// trimmed text, or as a `Value::Array` of content blocks (optional text block, then images).
pub content: Value,
/// `true` when attachments were present but at least one was not delivered as an image
/// block (vision off, nothing could be encoded, or some encodings failed).
pub vision_skipped: bool,
}
/// Outcome of converting attachments into `image_url` content blocks.
pub struct ImageBlocksResult {
/// One `{"type": "image_url", ...}` JSON block per successfully encoded attachment.
pub blocks: Vec<Value>,
/// `true` when at least one attachment was encoded into a block.
pub encoded_any: bool,
/// `true` when at least one attachment with an allowed MIME type was not encoded, either
/// because encoding was not attempted at all or because it failed.
pub skipped_any: bool,
}
/// Converts the vision-eligible attachments into `image_url` blocks carrying base64 data URLs.
///
/// Encoding is attempted only when `mode` is `RenderMode::Current` and `vision.enabled` is
/// set. Otherwise no file is touched and `skipped_any` simply reports whether any attachment
/// has a MIME type accepted by `vision.mime_allowed`.
///
/// Attachments whose MIME type is not allowed are ignored entirely. An allowed attachment
/// that fails to encode is logged with `warn!` and recorded via `skipped_any`.
pub fn build_image_blocks(
    attachments: &[MessageAttachment],
    mode: RenderMode,
    vision: &VisionConfig,
) -> ImageBlocksResult {
    let vision_active = vision.enabled && matches!(mode, RenderMode::Current);
    if !vision_active {
        let has_image = attachments
            .iter()
            .any(|candidate| vision.mime_allowed(&candidate.mime_type));
        return ImageBlocksResult {
            blocks: Vec::new(),
            encoded_any: false,
            skipped_any: has_image,
        };
    }

    // Starts out as the "nothing eligible" answer and is updated per attachment.
    let mut outcome = ImageBlocksResult {
        blocks: Vec::new(),
        encoded_any: false,
        skipped_any: false,
    };

    let eligible = attachments
        .iter()
        .filter(|candidate| vision.mime_allowed(&candidate.mime_type));
    for image in eligible {
        let encoded = encode_image_attachment(
            Path::new(&image.local_path),
            &image.mime_type,
            vision.max_image_bytes,
        );
        let data = match encoded {
            Ok(data) => data,
            Err(err) => {
                outcome.skipped_any = true;
                warn!(
                    path = %image.local_path,
                    mime = %image.mime_type,
                    error = %err,
                    "Skipping vision encoding for attachment"
                );
                continue;
            }
        };

        outcome.encoded_any = true;
        outcome.blocks.push(json!({
            "type": "image_url",
            "image_url": {
                "url": format!("data:{};base64,{}", image.mime_type, data)
            }
        }));
    }

    outcome
}
fn build_multimodal_content_force(
text: &str,
attachments: &[MessageAttachment],
encode_vision: bool,
vision: &VisionConfig,
) -> MultimodalBuildResult {
let text = text.trim();
if !encode_vision || attachments.is_empty() {
return MultimodalBuildResult {
content: Value::String(text.to_string()),
vision_skipped: !attachments.is_empty(),
};
}
let image_result = build_image_blocks(attachments, RenderMode::Current, vision);
if !image_result.encoded_any {
return MultimodalBuildResult {
content: Value::String(text.to_string()),
vision_skipped: true,
};
}
let mut blocks = Vec::new();
if !text.is_empty() {
blocks.push(json!({"type": "text", "text": text}));
}
blocks.extend(image_result.blocks);
MultimodalBuildResult {
content: Value::Array(blocks),
vision_skipped: image_result.skipped_any,
}
}
/// Builds content for a tool observation: `label` as the text plus only those attachments
/// whose provenance is `AttachmentProvenance::ToolObservation`.
///
/// Image encoding is attempted whenever `vision.enabled` is set.
pub fn build_tool_observation_content(
    label: &str,
    attachments: &[MessageAttachment],
    vision: &VisionConfig,
) -> MultimodalBuildResult {
    let mut observed: Vec<MessageAttachment> = Vec::new();
    for attachment in attachments {
        if attachment.provenance == AttachmentProvenance::ToolObservation {
            observed.push(attachment.clone());
        }
    }

    build_multimodal_content_force(label, &observed, vision.enabled, vision)
}
/// Formats the bracketed label that accompanies a tool observation image.
///
/// `result_hint` is trimmed; when nothing remains, the label names only the tool, otherwise
/// the hint is appended after a colon.
pub fn format_tool_observation_label(tool_name: &str, result_hint: &str) -> String {
    match result_hint.trim() {
        "" => format!("[Tool observation image from {tool_name}]"),
        hint => format!("[Tool observation image from {tool_name}: {hint}]"),
    }
}
/// Extracts the textual part of a message content value.
///
/// * A string value is returned as-is.
/// * An array yields the `"text"` fields of every block whose `"type"` is `"text"`, joined
///   with newlines; `None` when no such block exists.
/// * Any other JSON value yields `None`.
pub fn content_value_as_text(content: &Value) -> Option<String> {
    let blocks = match content {
        Value::String(plain) => return Some(plain.clone()),
        Value::Array(blocks) => blocks,
        _ => return None,
    };

    let mut texts: Vec<&str> = Vec::new();
    for block in blocks {
        let is_text_block = block.get("type").and_then(Value::as_str) == Some("text");
        if !is_text_block {
            continue;
        }
        // A text block whose "text" field is missing or not a string contributes nothing.
        if let Some(text) = block.get("text").and_then(Value::as_str) {
            texts.push(text);
        }
    }

    if texts.is_empty() {
        return None;
    }
    Some(texts.join("\n"))
}
/// Returns `true` when the text extracted from `content` is exactly `user_text`.
///
/// Content with no extractable text never matches, not even an empty `user_text`.
pub fn user_message_content_matches(content: &Value, user_text: &str) -> bool {
    let extracted = content_value_as_text(content);
    extracted.as_deref() == Some(user_text)
}
// Unit tests for `build_image_blocks`, covering the encode path, the non-current render
// mode, and an unreadable file.
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
// Vision enabled, 4 MiB cap, and the MIME list from `default_vision_mime_types`.
fn vision_config() -> VisionConfig {
VisionConfig {
enabled: true,
max_image_bytes: 4 * 1_048_576,
mime_types: default_vision_mime_types(),
}
}
// MIME types used by these tests' `VisionConfig`.
fn default_vision_mime_types() -> Vec<String> {
vec![
"image/jpeg".to_string(),
"image/png".to_string(),
"image/gif".to_string(),
"image/webp".to_string(),
]
}
// Two readable image attachments must each produce one `image_url` block, in order.
// Despite its name, this test exercises `build_image_blocks` directly.
#[test]
fn build_multimodal_content_multiple_images() {
// The payload is just the 8-byte PNG signature; the encoder does not inspect file contents,
// it relies on the declared MIME type.
let mut f1 = NamedTempFile::new().unwrap();
f1.write_all(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A])
.unwrap();
let mut f2 = NamedTempFile::new().unwrap();
f2.write_all(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A])
.unwrap();
let attachments = vec![
MessageAttachment {
local_path: f1.path().to_string_lossy().into_owned(),
filename: "a.png".to_string(),
mime_type: "image/png".to_string(),
size_bytes: 8,
..Default::default()
},
MessageAttachment {
local_path: f2.path().to_string_lossy().into_owned(),
filename: "b.png".to_string(),
mime_type: "image/png".to_string(),
size_bytes: 8,
..Default::default()
},
];
let result = build_image_blocks(&attachments, RenderMode::Current, &vision_config());
let blocks = result.blocks;
assert_eq!(blocks.len(), 2);
assert_eq!(blocks[0]["type"], "image_url");
assert_eq!(blocks[1]["type"], "image_url");
}
// In a non-`Current` render mode no encoding is attempted, so the path is never read and
// no blocks are produced.
#[test]
fn archived_mode_keeps_text_only() {
let attachments = vec![MessageAttachment {
local_path: "/tmp/missing.png".to_string(),
filename: "missing.png".to_string(),
mime_type: "image/png".to_string(),
size_bytes: 1,
..Default::default()
}];
let result = build_image_blocks(
&attachments,
RenderMode::Archived {
terminal_state: crate::events::TerminalState::Completed,
},
&vision_config(),
);
assert!(!result.encoded_any);
assert!(result.blocks.is_empty());
}
// An allowed-MIME attachment whose file cannot be read is reported as skipped rather than
// encoded. Assumes the hard-coded path does not exist on the test machine.
#[test]
fn missing_file_falls_back_to_text() {
let attachments = vec![MessageAttachment {
local_path: "/tmp/definitely-not-a-real-file-vision-test.png".to_string(),
filename: "x.png".to_string(),
mime_type: "image/png".to_string(),
size_bytes: 1,
..Default::default()
}];
let result = build_image_blocks(&attachments, RenderMode::Current, &vision_config());
assert!(!result.encoded_any);
assert!(result.skipped_any);
}
}