use anyhow::{anyhow, Result};
use serde::{Deserialize, Serialize};
use std::env;
use std::time::Duration;
use tracing::{debug, info, warn};
/// A function call carried in the Computer Use backend's reply.
///
/// This type appears as the `function_call` field of both
/// `ComputerUseBackendResponse` (deserialized from the backend) and
/// `ComputerUseResponse` (returned to callers).
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ComputerUseFunctionCall {
/// Name of the function being called.
pub name: String,
/// Arguments for the call as free-form JSON. When the field is missing
/// during deserialization it falls back to `serde_json::Value::default()`,
/// i.e. JSON `null`.
#[serde(default)]
pub args: serde_json::Value,
/// Optional identifier attached to the call.
pub id: Option<String>,
}
/// Result of a single `call_computer_use_backend` request.
///
/// Every field is copied verbatim from the parsed backend reply; the
/// backend-only fields (`duration_ms`, `model_used`, `error`) are dropped.
#[derive(Debug, Clone)]
pub struct ComputerUseResponse {
/// The `completed` flag reported by the backend.
pub completed: bool,
/// The function call returned by the backend, if any.
pub function_call: Option<ComputerUseFunctionCall>,
/// Free-form text returned by the backend, if any.
pub text: Option<String>,
/// The backend's `safety_decision` string, if any.
pub safety_decision: Option<String>,
}
/// One entry of the `previous_actions` array sent to the backend by
/// `call_computer_use_backend`.
#[derive(Debug, Serialize, Clone)]
pub struct ComputerUsePreviousAction {
/// Name of the action.
pub name: String,
/// Outcome of the action (success flag plus optional error message).
pub response: ComputerUseActionResponse,
/// Screenshot associated with this action.
/// NOTE(review): presumably base64-encoded image data, like the `image`
/// field of the request — confirm against the caller.
pub screenshot: String,
/// Optional URL; the key is left out of the serialized JSON when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub url: Option<String>,
}
/// Outcome of an action, serialized as the `response` field of
/// `ComputerUsePreviousAction`.
#[derive(Debug, Serialize, Clone)]
pub struct ComputerUseActionResponse {
/// Whether the action succeeded.
pub success: bool,
/// Optional error message; the key is left out of the serialized JSON when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
}
/// Serializable record of a single step; this is the value handed to a
/// `ProgressCallback`.
///
/// NOTE(review): nothing in this part of the file constructs the type, so the
/// field descriptions below are inferred from names and types — confirm
/// against the code that builds it.
#[derive(Debug, Clone, Serialize)]
pub struct ComputerUseStep {
/// Step number.
pub step: u32,
/// Name of the action for this step.
pub action: String,
/// Action arguments as free-form JSON.
pub args: serde_json::Value,
/// Whether the step succeeded.
pub success: bool,
/// Optional error message for the step.
pub error: Option<String>,
/// Optional text associated with the step.
pub text: Option<String>,
}
/// Serializable summary of a Computer Use run, including its list of steps.
///
/// NOTE(review): nothing in this part of the file constructs the type, so the
/// field descriptions below are inferred from names and types — confirm
/// against the code that builds it.
#[derive(Debug, Clone, Serialize)]
pub struct ComputerUseResult {
/// Status string for the run (the set of possible values is not defined here).
pub status: String,
/// The goal text for the run.
pub goal: String,
/// Count of executed steps.
pub steps_executed: u32,
/// Name of the final action.
pub final_action: String,
/// Optional final text.
pub final_text: Option<String>,
/// The recorded steps.
pub steps: Vec<ComputerUseStep>,
/// Optional free-form JSON describing a pending confirmation.
pub pending_confirmation: Option<serde_json::Value>,
/// Optional execution identifier.
pub execution_id: Option<String>,
}
/// Boxed callback that receives a reference to a `ComputerUseStep`.
/// The `Send + Sync` bounds allow the callback to be moved to and shared
/// between threads.
pub type ProgressCallback = Box<dyn Fn(&ComputerUseStep) + Send + Sync>;
/// Wire format of the backend's JSON reply, private to this module.
///
/// `call_computer_use_backend` deserializes the response body into this type
/// and then converts it into a `ComputerUseResponse`, or into an error when
/// `error` is set.
#[derive(Debug, Deserialize)]
struct ComputerUseBackendResponse {
// Required: it has no `#[serde(default)]`, so a reply without this key
// fails to deserialize.
completed: bool,
// Function call returned by the backend; `None` when absent.
#[serde(default)]
function_call: Option<ComputerUseFunctionCall>,
// Optional free-form text.
text: Option<String>,
// Optional safety decision string.
safety_decision: Option<String>,
// Deserialized but not read anywhere in this part of the file.
#[allow(dead_code)]
duration_ms: Option<u64>,
// Deserialized but not read anywhere in this part of the file.
#[allow(dead_code)]
model_used: Option<String>,
// When present, the call is reported to the caller as an `Err`.
error: Option<String>,
}
/// Sends a screenshot and a goal to the Computer Use backend and returns its reply.
///
/// The endpoint is taken from the `GEMINI_COMPUTER_USE_BACKEND_URL` environment
/// variable, falling back to `https://app.mediar.ai/api/vision/computer-use`.
/// The request is a JSON `POST` containing `image`, `goal` and
/// `previous_actions` (an empty array when `previous_actions` is `None`), sent
/// with a 300 second timeout.
///
/// # Errors
///
/// Returns an error when:
/// - the HTTP client cannot be built or the request fails to send,
/// - the backend answers with a non-success HTTP status (status and body are
///   included in the message),
/// - the response body cannot be read or parsed as JSON,
/// - the parsed reply carries an `error` field.
pub async fn call_computer_use_backend(
    base64_image: &str,
    goal: &str,
    previous_actions: Option<&[ComputerUsePreviousAction]>,
) -> Result<ComputerUseResponse> {
    /// Returns the longest prefix of `text` that is at most `max_bytes` bytes
    /// long and ends on a UTF-8 character boundary.
    ///
    /// Slicing a `&str` at an arbitrary byte offset panics when the offset
    /// falls inside a multi-byte character, so log previews must back off to
    /// the nearest boundary.
    fn log_prefix(text: &str, max_bytes: usize) -> &str {
        let mut end = text.len().min(max_bytes);
        // Offset 0 is always a boundary, so this loop terminates.
        while !text.is_char_boundary(end) {
            end -= 1;
        }
        &text[..end]
    }

    let backend_url = env::var("GEMINI_COMPUTER_USE_BACKEND_URL")
        .unwrap_or_else(|_| "https://app.mediar.ai/api/vision/computer-use".to_string());
    info!(
        "[computer_use] Calling backend at {} (goal: {})",
        backend_url,
        log_prefix(goal, 50)
    );

    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(300))
        .build()?;
    let payload = serde_json::json!({
        "image": base64_image,
        "goal": goal,
        "previous_actions": previous_actions.unwrap_or(&[])
    });
    let resp = client
        .post(&backend_url)
        .header("Content-Type", "application/json")
        .json(&payload)
        .send()
        .await?;

    let status = resp.status();
    if !status.is_success() {
        // Best effort: an unreadable error body is reported as an empty string.
        let text = resp.text().await.unwrap_or_default();
        warn!("[computer_use] Backend error: {} - {}", status, text);
        return Err(anyhow!("Computer Use backend error ({}): {}", status, text));
    }

    let response_text = resp.text().await?;
    debug!(
        "[computer_use] Backend response: {}",
        log_prefix(&response_text, 500)
    );
    let backend_response: ComputerUseBackendResponse = serde_json::from_str(&response_text)
        .map_err(|e| anyhow!("Failed to parse backend response: {}", e))?;

    // A success status can still carry an application-level error.
    if let Some(error) = backend_response.error {
        return Err(anyhow!("Computer Use error: {}", error));
    }

    Ok(ComputerUseResponse {
        completed: backend_response.completed,
        function_call: backend_response.function_call,
        text: backend_response.text,
        safety_decision: backend_response.safety_decision,
    })
}
/// Translates a `+`-separated key combination (e.g. `"control+a"`) into the
/// brace notation produced by this module (e.g. `"{Ctrl}a"`).
///
/// Each segment is trimmed and lower-cased before lookup. Segments are
/// resolved in this order:
/// 1. named keys and modifiers (`ctrl`, `enter`, `pgdn`, ...),
/// 2. function keys `f1`..`f24`,
/// 3. a single-byte key, accepted only as the last segment and emitted as-is
///    (lower-cased),
/// 4. anything else is rejected.
///
/// # Errors
///
/// Returns a descriptive message when a segment starting with `f` (and at
/// least two bytes long) is not a function key in `f1`..`f24`, or when a
/// segment is not recognised at all.
pub fn translate_gemini_keys(gemini_keys: &str) -> Result<String, String> {
    /// Looks up the brace notation for a named key or modifier.
    fn named_key(token: &str) -> Option<&'static str> {
        let name = match token {
            "control" | "ctrl" => "{Ctrl}",
            "alt" => "{Alt}",
            "shift" => "{Shift}",
            "meta" | "cmd" | "command" | "win" | "windows" | "super" => "{Win}",
            "enter" | "return" => "{Enter}",
            "tab" => "{Tab}",
            "escape" | "esc" => "{Escape}",
            "backspace" | "back" => "{Backspace}",
            "delete" | "del" => "{Delete}",
            "space" => "{Space}",
            "insert" | "ins" => "{Insert}",
            "home" => "{Home}",
            "end" => "{End}",
            "pageup" | "pgup" => "{PageUp}",
            "pagedown" | "pgdown" | "pgdn" => "{PageDown}",
            "printscreen" | "prtsc" => "{PrintScreen}",
            "up" | "arrowup" => "{Up}",
            "down" | "arrowdown" => "{Down}",
            "left" | "arrowleft" => "{Left}",
            "right" | "arrowright" => "{Right}",
            _ => return None,
        };
        Some(name)
    }

    // `split` always yields at least one segment, so the subtraction cannot underflow.
    let last_index = gemini_keys.split('+').count() - 1;
    let mut sequence = String::new();

    for (index, raw) in gemini_keys.split('+').enumerate() {
        let token = raw.trim().to_lowercase();

        if let Some(name) = named_key(&token) {
            sequence.push_str(name);
            continue;
        }

        // Function keys: anything of the form `f<number>` with number in 1..=24.
        if token.starts_with('f') && token.len() >= 2 {
            match token[1..].parse::<u8>() {
                Ok(number) if (1..=24).contains(&number) => {
                    sequence.push_str(&format!("{{F{}}}", number));
                }
                _ => {
                    // The message reports the segment exactly as the caller wrote it.
                    return Err(format!(
                        "Invalid function key '{}' in '{}'. Use f1-f24.",
                        raw, gemini_keys
                    ));
                }
            }
            continue;
        }

        // A plain single-byte key is only valid as the final segment.
        if token.len() == 1 && index == last_index {
            sequence.push_str(&token);
            continue;
        }

        return Err(format!(
            "Unknown key '{}' in combination '{}'. Valid: enter, tab, escape, \
             backspace, delete, space, up/down/left/right, home, end, pageup, \
             pagedown, f1-f24, or modifiers (ctrl, alt, shift, meta) with letters.",
            token, gemini_keys
        ));
    }

    Ok(sequence)
}
/// Maps a normalized point to screen coordinates.
///
/// For each axis the computation is, in order:
/// 1. `norm / 1000.0 * screenshot_extent` — scale the normalized value to
///    screenshot pixels,
/// 2. divide by `resize_scale`,
/// 3. divide by `dpi_scale`,
/// 4. add the window origin (`window_x` / `window_y`).
///
/// No validation is performed: a zero `resize_scale` or `dpi_scale` yields
/// infinite or NaN results, as with any `f64` division.
///
/// Returns `(screen_x, screen_y)`.
// Eight scalar parameters are kept as-is so existing callers stay unchanged.
#[allow(clippy::too_many_arguments)]
pub fn convert_normalized_to_screen(
    norm_x: f64,
    norm_y: f64,
    window_x: f64,
    window_y: f64,
    screenshot_w: f64,
    screenshot_h: f64,
    dpi_scale: f64,
    resize_scale: f64,
) -> (f64, f64) {
    // The same pipeline applies to both axes; only the inputs differ.
    let to_screen = |norm: f64, extent: f64, origin: f64| -> f64 {
        let screenshot_px = (norm / 1000.0) * extent;
        let unresized_px = screenshot_px / resize_scale;
        origin + unresized_px / dpi_scale
    };

    (
        to_screen(norm_x, screenshot_w, window_x),
        to_screen(norm_y, screenshot_h, window_y),
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every `(input, expected)` pair translates successfully to `expected`.
    fn assert_translations(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(translate_gemini_keys(input).unwrap(), expected);
        }
    }

    /// Asserts that `actual` is within `0.001` of `expected`.
    fn assert_close(actual: f64, expected: f64) {
        assert!((actual - expected).abs() < 0.001);
    }

    // Named keys on their own.
    #[test]
    fn test_translate_gemini_keys_simple() {
        assert_translations(&[
            ("enter", "{Enter}"),
            ("tab", "{Tab}"),
            ("escape", "{Escape}"),
        ]);
    }

    // Modifier combinations ending in a letter; input case is normalized to lower case.
    #[test]
    fn test_translate_gemini_keys_modifiers() {
        assert_translations(&[
            ("control+a", "{Ctrl}a"),
            ("ctrl+c", "{Ctrl}c"),
            ("Meta+Shift+T", "{Win}{Shift}t"),
        ]);
    }

    // Function keys, alone and combined with a modifier.
    #[test]
    fn test_translate_gemini_keys_function() {
        assert_translations(&[
            ("f1", "{F1}"),
            ("f12", "{F12}"),
            ("alt+f4", "{Alt}{F4}"),
        ]);
    }

    // Centre of a 1000x1000 screenshot, first at the origin and then with a window offset.
    #[test]
    fn test_convert_normalized_coords() {
        let at_origin =
            convert_normalized_to_screen(500.0, 500.0, 0.0, 0.0, 1000.0, 1000.0, 1.0, 1.0);
        assert_close(at_origin.0, 500.0);
        assert_close(at_origin.1, 500.0);

        let with_offset =
            convert_normalized_to_screen(500.0, 500.0, 100.0, 200.0, 1000.0, 1000.0, 1.0, 1.0);
        assert_close(with_offset.0, 600.0);
        assert_close(with_offset.1, 700.0);
    }
}