use anyhow::{Context, Result};
use base64::{Engine, engine::general_purpose::STANDARD as BASE64};
use serde_json::Value;
use sha2::{Digest, Sha256};
use std::fs;
use std::path::Path;
use super::ImageInfo;
pub(super) fn count_images_in_jsonl(content: &str) -> Result<usize> {
let mut count = 0;
for line in content.lines() {
if line.trim().is_empty() {
continue;
}
let msg: Value = serde_json::from_str(line).context("Failed to parse JSONL line")?;
count += count_images_in_value(&msg);
}
Ok(count)
}
fn count_images_in_value(value: &Value) -> usize {
match value {
Value::Object(map) => {
if let Some(Value::String(type_val)) = map.get("type")
&& type_val == "image"
&& let Some(Value::Object(source)) = map.get("source")
&& let Some(Value::String(source_type)) = source.get("type")
&& source_type == "base64"
{
1
} else {
map.values().map(count_images_in_value).sum()
}
}
Value::Array(arr) => arr.iter().map(count_images_in_value).sum(),
_ => 0,
}
}
pub(super) fn extract_images_from_jsonl(
content: &str,
images_dir: &Path,
) -> Result<(String, Vec<ImageInfo>)> {
let mut images = Vec::new();
let mut modified_lines = Vec::new();
for line in content.lines() {
if line.trim().is_empty() {
modified_lines.push(line.to_string());
continue;
}
let mut msg: Value = serde_json::from_str(line).context("Failed to parse JSONL line")?;
extract_images_from_value(&mut msg, images_dir, &mut images)?;
modified_lines.push(serde_json::to_string(&msg)?);
}
Ok((modified_lines.join("\n") + "\n", images))
}
fn extract_images_from_value(
value: &mut Value,
images_dir: &Path,
images: &mut Vec<ImageInfo>,
) -> Result<()> {
match value {
Value::Object(map) => {
if let Some(Value::String(type_val)) = map.get("type")
&& type_val == "image"
&& let Some(Value::Object(source)) = map.get("source")
&& let Some(Value::String(source_type)) = source.get("type")
&& source_type == "base64"
&& let Some(Value::String(media_type)) = source.get("media_type")
&& let Some(Value::String(data)) = source.get("data")
{
let tool_use_id = map
.get("tool_use_id")
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let media_type = media_type.clone();
let data = data.clone();
let (hash, size_bytes) = hash_image_data(&data)?;
let file_ref = save_image(&data, &hash, &media_type, images_dir)?;
if !images.iter().any(|img| img.hash == hash) {
images.push(ImageInfo {
hash: hash.clone(),
media_type: media_type.clone(),
size_bytes,
original_tool_use_id: tool_use_id,
});
}
if let Some(Value::Object(source)) = map.get_mut("source") {
source.clear();
source.insert("type".to_string(), Value::String("file".to_string()));
source.insert("file".to_string(), Value::String(file_ref));
}
} else {
for val in map.values_mut() {
extract_images_from_value(val, images_dir, images)?;
}
}
}
Value::Array(arr) => {
for item in arr.iter_mut() {
extract_images_from_value(item, images_dir, images)?;
}
}
_ => {}
}
Ok(())
}
fn hash_image_data(base64_data: &str) -> Result<(String, u64)> {
let image_bytes = BASE64
.decode(base64_data)
.context("Failed to decode base64 image")?;
let mut hasher = Sha256::new();
hasher.update(&image_bytes);
let hash = format!("{:x}", hasher.finalize());
Ok((hash, image_bytes.len() as u64))
}
fn save_image(
base64_data: &str,
hash: &str,
media_type: &str,
images_dir: &Path,
) -> Result<String> {
let image_bytes = BASE64
.decode(base64_data)
.context("Failed to decode base64 image")?;
let ext = match media_type {
"image/png" => "png",
"image/jpeg" => "jpg",
"image/webp" => "webp",
"image/gif" => "gif",
"image/svg+xml" => "svg",
unknown => {
eprintln!(
"Warning: unknown image media type '{}', saving as .bin",
unknown
);
"bin"
}
};
let filename = format!("{}.{}", hash, ext);
let file_path = images_dir.join(&filename);
if !file_path.exists() {
fs::write(&file_path, image_bytes)
.with_context(|| format!("Failed to write image file: {}", filename))?;
}
Ok(format!("images/{}", filename))
}