use std::{
collections::BTreeMap,
sync::{
Arc,
atomic::{AtomicBool, Ordering},
},
};
use anyhow::{Context as _, Result};
use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64};
use futures::StreamExt;
use tracing::{debug, info, warn};
use super::{
action::{Action, ExecCtx, MouseButton, ParsedAction, ScrollDir},
app_rules::AppRuleSet,
operator::Operator,
parser::{CoordFormat, parse_vlm_response},
permission::{PermissionDecision, PermissionRequest, PermissionStore},
prompt::{PromptInputs, build_system_prompt},
status::ComputerUseStatus,
};
use crate::provider::{
AgentEndpoint, ContentPart, LlmProvider, LlmRequest, Message, MessageContent, Role, StreamEvent,
};
/// Terminal result of a [`VlmDriver`] run.
///
/// Every variant except `PermissionDenied` carries `steps`, the value of the
/// driver's step counter at the moment the run ended.
#[derive(Debug, Clone)]
pub enum DriverOutcome {
/// The model emitted `finished` and the completion verifier accepted it.
/// `content` is the action's `content` argument, or the model's thought when
/// that argument is missing or blank.
Finished { content: String, steps: usize },
/// The model emitted `call_user`. `reason` comes from the `reason` / `content`
/// arguments, falling back to the model's thought.
CallUser { reason: String, steps: usize },
/// The step counter reached `VlmDriver::max_loop`.
MaxLoop { steps: usize },
/// The shared abort flag was observed set.
UserAbort { steps: usize },
/// The permission gate refused the run (stored `Deny`, no way to ask, or the
/// request timed out).
PermissionDenied,
/// The run could not continue: screenshot failure, VLM stream failure,
/// repeated unparseable replies, an `error_env` action, or an error returned
/// by `Operator::execute`.
OperatorError { message: String, steps: usize },
}
/// One recorded turn of the agent loop; kept in the history that is replayed
/// to the model and mirrored to the status listener.
#[derive(Debug, Clone)]
pub struct Step {
/// The model's thought for this turn (empty for unparseable replies).
pub thought: String,
/// Short rendering of the action, e.g. `click(start_box=...)`, or a
/// placeholder when no action could be parsed.
pub action_summary: String,
/// Whether the turn succeeded (operator result, or verifier verdict for
/// `finished`).
pub result_ok: bool,
/// Optional detail shown next to the action in the replayed history.
pub result_message: Option<String>,
}
/// Drives a screenshot → VLM prediction → action loop against an [`Operator`].
pub struct VlmDriver<'a> {
/// Takes screenshots and executes the mapped actions.
pub operator: &'a dyn Operator,
/// LLM backend used to stream predictions (and completion verdicts).
pub provider: Arc<dyn LlmProvider>,
/// Value sent as `model` in every request.
pub model_name: String,
/// Coordinate format handed to `parse_vlm_response`.
pub coord_format: CoordFormat,
/// Maximum number of steps before the run ends with `MaxLoop`.
pub max_loop: usize,
/// Shared cancellation flag; polled by the loop, the stream reader and the
/// permission wait.
pub abort: Arc<AtomicBool>,
/// Rule set matched against the instruction when building the system prompt.
pub app_rules: &'a AppRuleSet,
/// Store consulted (and, in headless auto-allow mode, written) by the
/// permission gate.
pub permission: Arc<dyn PermissionStore>,
/// Identifier of the agent on whose behalf the run executes.
pub agent_id: String,
/// Target application name; when empty the permission gate uses the
/// operator's name instead.
pub app: String,
/// Callback that receives the `PermissionRequest` when no decision is stored.
pub permission_emit: Option<Arc<dyn Fn(PermissionRequest) + Send + Sync + 'a>>,
/// When no `permission_emit` is wired: `true` records `AllowOnce` and
/// proceeds, `false` denies the run.
pub headless_auto_allow: bool,
/// Optional listener for `ComputerUseStatus` progress events.
pub status_emit: Option<Arc<dyn Fn(ComputerUseStatus) + Send + Sync + 'a>>,
/// Identifier attached to every status event of this run.
pub run_id: String,
}
impl VlmDriver<'_> {
pub async fn run(&self, instruction: &str) -> Result<DriverOutcome> {
let outcome = self.run_inner(instruction).await?;
self.emit_finished(&outcome);
Ok(outcome)
}
/// Core agent loop: permission gate, then repeated
/// screenshot → VLM prediction → parse → execute turns.
///
/// Returns `Ok(outcome)` for every way the loop can end; only a failing
/// permission check or a failing initial screenshot is returned as `Err`.
async fn run_inner(&self, instruction: &str) -> Result<DriverOutcome> {
// The gate yields `Some(outcome)` when the run must not proceed.
if let Some(deny) = self.permission_gate(instruction).await? {
return Ok(deny);
}
self.emit_started(instruction);
// Probe screenshot: its size feeds the system prompt, and the image itself
// is reused as the first loop iteration's screenshot.
let probe_snap = self
.operator
.screenshot()
.await
.context("initial screenshot")?;
let probe_dims = probe_snap.physical_size;
let mut next_snap: Option<super::action::Screenshot> = Some(probe_snap);
let action_spaces = self.operator.action_spaces();
let matched: Vec<&_> = self.app_rules.match_instruction(instruction);
let system_prompt = build_system_prompt(&PromptInputs {
instruction,
action_spaces: &action_spaces,
matched_rules: &matched,
screen_size: Some(probe_dims),
});
info!(
agent = %self.agent_id,
app = %self.app,
operator = %self.operator.name(),
model = %self.model_name,
max_loop = self.max_loop,
matched_rules = matched.len(),
screen = format!("{}x{}", probe_dims.0, probe_dims.1),
"VlmDriver.run starting"
);
let mut history: Vec<Step> = Vec::new();
let mut steps = 0usize;
// Number of replies in a row that contained no parseable action.
let mut consecutive_unparseable = 0usize;
const MAX_CONSECUTIVE_UNPARSEABLE: usize = 3;
loop {
if self.abort.load(Ordering::SeqCst) {
return Ok(DriverOutcome::UserAbort { steps });
}
if steps >= self.max_loop {
return Ok(DriverOutcome::MaxLoop { steps });
}
// Use the probe screenshot once, then take a fresh one every turn.
let snap = if let Some(s) = next_snap.take() {
s
} else {
match self.operator.screenshot().await {
Ok(s) => s,
Err(e) => {
warn!(error = %e, "screenshot failed");
return Ok(DriverOutcome::OperatorError {
message: format!("screenshot: {e}"),
steps,
});
}
}
};
let snap_b64 = BASE64.encode(&snap.png_bytes);
let screen_w = snap.physical_size.0;
let screen_h = snap.physical_size.1;
let scale = snap.scale_factor;
self.emit_thinking(steps + 1);
// Each request is a single user message: task + history text, plus the
// current screenshot as a data URL.
let user_text = build_user_message(instruction, &history);
let messages = vec![Message {
role: Role::User,
content: MessageContent::Parts(vec![
ContentPart::Text { text: user_text },
ContentPart::Image {
url: format!("data:image/png;base64,{snap_b64}"),
},
]),
rsclaw_hidden: None,
}];
let req = LlmRequest {
fallback_models: Vec::new(),
model: self.model_name.clone(),
messages,
tools: Vec::new(),
system: Some(system_prompt.clone()),
max_tokens: Some(2048),
temperature: Some(0.0),
frequency_penalty: None,
thinking_budget: None,
endpoint: AgentEndpoint::Vision,
kv_cache_mode: 0,
session_key: None,
system_shared: None,
user_system: None,
recall: None,
};
let prediction =
match stream_prediction(self.provider.as_ref(), req, self.abort.as_ref()).await {
Ok(p) => p,
// A user abort is signalled through the error text, not a typed error.
Err(e) if e.to_string().contains(STREAM_ABORTED) => {
return Ok(DriverOutcome::UserAbort { steps });
}
Err(e) => {
warn!(error = %e, "VLM stream failed");
return Ok(DriverOutcome::OperatorError {
message: format!("vlm stream: {e}"),
steps,
});
}
};
debug!(prediction_len = prediction.len(), "VLM prediction received");
let mut parsed = parse_vlm_response(&prediction, self.coord_format);
if parsed.is_empty() {
consecutive_unparseable += 1;
warn!(
prediction = %prediction.chars().take(200).collect::<String>(),
streak = consecutive_unparseable,
"VLM produced no parseable actions"
);
if consecutive_unparseable >= MAX_CONSECUTIVE_UNPARSEABLE {
// NOTE(review): the preview below is taken from the most recent reply,
// although the message text says "First reply".
return Ok(DriverOutcome::OperatorError {
message: format!(
"model produced no `Action:` line for {} consecutive turns. \
First reply preview: {}",
consecutive_unparseable,
prediction.chars().take(200).collect::<String>(),
),
steps,
});
}
// Record a failed step so the next prompt reminds the model of the
// required reply format; this turn counts towards `max_loop`.
let step = Step {
thought: String::new(),
action_summary: "(no parseable action — your reply was missing the required `Action: ...` line)".to_owned(),
result_ok: false,
result_message: Some(
"Reminder: every reply must end with one `Action:` line picking from the Action Space (click/type/scroll/wait/finished/etc). Do NOT discuss tools."
.to_owned(),
),
};
self.emit_step(steps + 1, &step);
history.push(step);
steps += 1;
continue;
}
if parsed.len() > 1 {
warn!(
action_count = parsed.len(),
first_action = %parsed[0].action_type,
"VLM emitted multiple actions in one turn; executing only the first so every action gets a fresh screenshot"
);
parsed.truncate(1);
}
consecutive_unparseable = 0;
// `parsed` holds exactly one action here, so a `continue` inside this `for`
// falls through to the next iteration of the outer loop.
for pa in parsed {
let summary = summarize_parsed(&pa);
info!(
step = steps + 1,
action_type = %pa.action_type,
raw_start = ?pa.start,
raw_end = ?pa.end,
screen_w,
screen_h,
scale,
"VLM action parsed"
);
// Terminal / control actions are handled here; everything else is mapped to
// an operator action below.
match pa.action_type.as_str() {
"finished" => {
let content = terminal_action_text(&pa, &["content"])
.unwrap_or_else(|| pa.thought.clone());
// A second model call checks the claim against the same screenshot the
// model just saw; an unverified claim is recorded as a failed step and the
// loop continues.
let verified = verify_finished_claim(
self.provider.as_ref(),
&self.model_name,
instruction,
&history,
&pa.thought,
&content,
&snap_b64,
self.abort.as_ref(),
)
.await;
let step = Step {
thought: pa.thought.clone(),
action_summary: summary,
result_ok: verified,
result_message: if verified {
None
} else {
Some(
"Completion verifier could not confirm the requested end state from the current screenshot; continue instead of returning completed=true."
.to_owned(),
)
},
};
self.emit_step(steps + 1, &step);
history.push(step);
steps += 1;
if verified {
info!(steps, "VlmDriver: finished");
return Ok(DriverOutcome::Finished { content, steps });
}
continue;
}
"call_user" => {
let reason = terminal_action_text(&pa, &["reason", "content"])
.unwrap_or_else(|| pa.thought.clone());
info!(steps, "VlmDriver: call_user");
let step = Step {
thought: pa.thought.clone(),
action_summary: summary,
result_ok: true,
result_message: None,
};
self.emit_step(steps + 1, &step);
history.push(step);
// NOTE(review): unlike `finished`, `steps` is not incremented for this
// emitted step — confirm this is intended.
return Ok(DriverOutcome::CallUser { reason, steps });
}
"error_env" => {
return Ok(DriverOutcome::OperatorError {
message: pa
.raw_args
.get("content")
.cloned()
.unwrap_or_else(|| "error_env".to_owned()),
steps,
});
}
_ => {}
}
// An action that cannot be mapped is recorded as a failed step so the model
// sees the feedback on the next turn.
let Some(action) = parsed_to_action(&pa, screen_w, screen_h) else {
warn!(
action_type = %pa.action_type,
"could not map parsed action; skipping"
);
let step = Step {
thought: pa.thought.clone(),
action_summary: summary,
result_ok: false,
result_message: Some("unmapped action type".to_owned()),
};
self.emit_step(steps + 1, &step);
history.push(step);
steps += 1;
if steps >= self.max_loop {
return Ok(DriverOutcome::MaxLoop { steps });
}
continue;
};
let ctx = ExecCtx {
screen_w,
screen_h,
scale_factor: scale,
factors: [screen_w.max(1), screen_h.max(1)],
};
// An `Err` from the operator ends the run; a completed-but-failed action is
// reported through `exec_result.ok` and the loop goes on.
let exec_result = match self.operator.execute(&action, &ctx).await {
Ok(r) => r,
Err(e) => {
return Ok(DriverOutcome::OperatorError {
message: format!("operator.execute: {e}"),
steps,
});
}
};
let step = Step {
thought: pa.thought.clone(),
action_summary: summary,
result_ok: exec_result.ok,
result_message: exec_result.message.clone(),
};
info!(
step = steps + 1,
action = %step.action_summary,
ok = step.result_ok,
message = ?step.result_message,
"VLM action executed"
);
self.emit_step(steps + 1, &step);
history.push(step);
steps += 1;
if self.abort.load(Ordering::SeqCst) {
return Ok(DriverOutcome::UserAbort { steps });
}
if steps >= self.max_loop {
return Ok(DriverOutcome::MaxLoop { steps });
}
}
}
}
/// Decides whether the run may start.
///
/// Returns `Ok(None)` when the run is allowed and `Ok(Some(outcome))` when it
/// must stop (`PermissionDenied`, or `UserAbort` if the abort flag is raised
/// while waiting for an answer). With no stored decision the request is sent
/// through `permission_emit` and the store is polled for up to 60 seconds
/// with a back-off capped at 2 seconds.
///
/// # Errors
///
/// Propagates errors from `PermissionStore::check`.
async fn permission_gate(&self, instruction: &str) -> Result<Option<DriverOutcome>> {
    use std::time::{Duration, SystemTime, UNIX_EPOCH};

    if self.permission.bypass_all() {
        return Ok(None);
    }

    // Permissions are keyed by app; fall back to the operator name when unset.
    let app = match self.app.as_str() {
        "" => self.operator.name().to_owned(),
        named => named.to_owned(),
    };

    let stored = self.permission.check(&self.agent_id, &app).await?;
    match stored {
        Some(PermissionDecision::Deny) => {
            return Ok(Some(DriverOutcome::PermissionDenied));
        }
        Some(
            PermissionDecision::AllowAlways
            | PermissionDecision::AllowSession
            | PermissionDecision::AllowOnce,
        ) => return Ok(None),
        None => {}
    }

    // No stored decision: without an emitter nobody can be asked.
    let Some(emit) = self.permission_emit.as_ref() else {
        if !self.headless_auto_allow {
            tracing::warn!(
                agent_id = %self.agent_id,
                app = %app,
                "permission gate: no emitter wired AND headless_auto_allow=false; denying"
            );
            return Ok(Some(DriverOutcome::PermissionDenied));
        }
        info!("no permission emitter; headless_auto_allow → AllowOnce");
        // Best effort: a failure to persist the decision does not block the run.
        let _ = self
            .permission
            .record(&self.agent_id, &app, PermissionDecision::AllowOnce)
            .await;
        return Ok(None);
    };

    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|elapsed| elapsed.as_nanos())
        .unwrap_or(0);
    let request_id = format!("{}-{}", self.agent_id, nanos);
    let target: &str = if app.is_empty() {
        self.operator.name()
    } else {
        app.as_str()
    };
    let reason = format!(
        "Run a GUI agent loop on {}: \"{}\"",
        target,
        truncate(instruction, 200)
    );
    emit(PermissionRequest {
        request_id,
        agent_id: self.agent_id.clone(),
        app: app.clone(),
        reason,
        estimated_steps: self.max_loop,
    });

    // Poll the store until a decision appears, the user aborts, or time runs out.
    let deadline = tokio::time::Instant::now() + Duration::from_secs(60);
    let max_delay = Duration::from_secs(2);
    let mut delay = Duration::from_millis(200);
    while tokio::time::Instant::now() < deadline {
        tokio::time::sleep(delay).await;
        if self.abort.load(Ordering::SeqCst) {
            return Ok(Some(DriverOutcome::UserAbort { steps: 0 }));
        }
        match self.permission.check(&self.agent_id, &app).await? {
            Some(PermissionDecision::Deny) => {
                return Ok(Some(DriverOutcome::PermissionDenied));
            }
            Some(_) => return Ok(None),
            None => delay = (delay * 2).min(max_delay),
        }
    }

    warn!(
        agent = %self.agent_id,
        app = %app,
        "permission request timed out"
    );
    Ok(Some(DriverOutcome::PermissionDenied))
}
/// Forwards `ev` to the status listener, if one is installed.
fn emit_status(&self, ev: ComputerUseStatus) {
    let Some(listener) = self.status_emit.as_ref() else {
        return;
    };
    listener(ev);
}
/// Emits the `Started` status event; the instruction is shortened to 200
/// characters.
fn emit_started(&self, instruction: &str) {
    let started = ComputerUseStatus::Started {
        run_id: self.run_id.clone(),
        agent_id: self.agent_id.clone(),
        app: self.app.clone(),
        instruction: truncate(instruction, 200),
        max_steps: self.max_loop,
    };
    self.emit_status(started);
}
/// Emits the `Thinking` status event for the step about to be predicted.
fn emit_thinking(&self, step_index: usize) {
    let run_id = self.run_id.clone();
    self.emit_status(ComputerUseStatus::Thinking { run_id, step_index });
}
/// Emits the `Step` status event for a recorded step; the thought is
/// shortened to 200 characters and the result message to 120.
fn emit_step(&self, step_index: usize, step: &Step) {
    let result_message = step
        .result_message
        .as_ref()
        .map(|message| truncate(message.as_str(), 120));
    let event = ComputerUseStatus::Step {
        run_id: self.run_id.clone(),
        step_index,
        action_summary: step.action_summary.clone(),
        thought: truncate(&step.thought, 200),
        result_ok: step.result_ok,
        result_message,
    };
    self.emit_status(event);
}
/// Emits the terminal `Finished` status event describing `outcome`.
///
/// The summary is the outcome's text payload shortened to 200 characters, or
/// empty for outcomes that carry no text.
fn emit_finished(&self, outcome: &DriverOutcome) {
    let outcome_kind = match outcome {
        DriverOutcome::Finished { .. } => "finished",
        DriverOutcome::CallUser { .. } => "call_user",
        DriverOutcome::MaxLoop { .. } => "max_loop",
        DriverOutcome::UserAbort { .. } => "user_abort",
        DriverOutcome::PermissionDenied => "permission_denied",
        DriverOutcome::OperatorError { .. } => "operator_error",
    };
    let steps = match outcome {
        DriverOutcome::Finished { steps, .. }
        | DriverOutcome::CallUser { steps, .. }
        | DriverOutcome::MaxLoop { steps }
        | DriverOutcome::UserAbort { steps }
        | DriverOutcome::OperatorError { steps, .. } => *steps,
        DriverOutcome::PermissionDenied => 0,
    };
    let summary = match outcome {
        DriverOutcome::Finished { content: text, .. }
        | DriverOutcome::CallUser { reason: text, .. }
        | DriverOutcome::OperatorError { message: text, .. } => truncate(text, 200),
        DriverOutcome::MaxLoop { .. }
        | DriverOutcome::UserAbort { .. }
        | DriverOutcome::PermissionDenied => String::new(),
    };
    self.emit_status(ComputerUseStatus::Finished {
        run_id: self.run_id.clone(),
        outcome_kind: outcome_kind.to_owned(),
        steps,
        summary,
    });
}
}
/// Builds the text part of the per-turn user message.
///
/// With no history the result is just `Task: <instruction>`. Otherwise the
/// task is followed by a numbered list of the most recent steps (at most
/// ten, numbered from 1 within that window), each rendered as
/// `<n>. <action_summary>` plus ` → <result_message>` when the step carries a
/// non-empty message (shortened to 80 characters).
fn build_user_message(instruction: &str, history: &[Step]) -> String {
    use std::fmt::Write as _;

    /// Number of trailing steps replayed to the model.
    const MAX_HISTORY_STEPS: usize = 10;
    /// Character budget for each step's result message.
    const MAX_RESULT_CHARS: usize = 80;

    if history.is_empty() {
        return format!("Task: {instruction}");
    }

    let tail = &history[history.len().saturating_sub(MAX_HISTORY_STEPS)..];
    // Size the buffer from what is actually rendered (the tail), not the full history.
    let mut s = String::with_capacity(instruction.len() + 64 + tail.len() * 64);
    s.push_str("Task: ");
    s.push_str(instruction);
    s.push_str("\n\nHistory (most recent last):\n");
    for (i, step) in tail.iter().enumerate() {
        // Formatting straight into `s` avoids a temporary `String` per line;
        // writing to a `String` cannot fail, so the result is ignored.
        let _ = write!(s, "{}. {}", i + 1, step.action_summary);
        if let Some(msg) = step.result_message.as_deref().filter(|m| !m.is_empty()) {
            s.push_str(" → ");
            s.push_str(&truncate(msg, MAX_RESULT_CHARS));
        }
        s.push('\n');
    }
    s
}
/// Returns `s` limited to at most `n` characters (Unicode scalar values).
///
/// Strings of `n` characters or fewer are returned unchanged; longer ones are
/// cut after the `n`-th character and get a trailing `…`.
fn truncate(s: &str, n: usize) -> String {
    // `nth(n)` locates the first character past the limit in one pass and stops
    // there, so long inputs are not scanned to the end just to be counted.
    match s.char_indices().nth(n) {
        None => s.to_owned(),
        Some((cut, _)) => {
            let mut out = String::with_capacity(cut + '…'.len_utf8());
            out.push_str(&s[..cut]);
            out.push('…');
            out
        }
    }
}
/// Renders a parsed action as `action_type(key=value, ...)`, with every
/// argument value shortened to 40 characters. Arguments appear in the
/// iteration order of `raw_args`.
fn summarize_parsed(p: &ParsedAction) -> String {
    let mut rendered = String::new();
    for (position, (key, value)) in p.raw_args.iter().enumerate() {
        if position > 0 {
            rendered.push_str(", ");
        }
        rendered.push_str(key);
        rendered.push('=');
        rendered.push_str(&truncate(value, 40));
    }
    format!("{}({})", p.action_type, rendered)
}
/// Returns the first non-blank argument value among `keys`, tried in order.
///
/// A key whose value is missing, empty, or whitespace-only is skipped so that
/// a later key can still supply the text (for example a blank `reason` no
/// longer hides a usable `content`). Returns `None` when no key yields text.
fn terminal_action_text(p: &ParsedAction, keys: &[&str]) -> Option<String> {
    keys.iter()
        .filter_map(|key| p.raw_args.get(*key))
        .find(|value| !value.trim().is_empty())
        .cloned()
}
async fn verify_finished_claim(
provider: &dyn LlmProvider,
model_name: &str,
instruction: &str,
history: &[Step],
thought: &str,
content: &str,
snap_b64: &str,
abort: &AtomicBool,
) -> bool {
let history_text = if history.is_empty() {
"(no previous actions)".to_owned()
} else {
history
.iter()
.enumerate()
.map(|(idx, step)| {
format!(
"{}. action={} ok={} message={}",
idx + 1,
step.action_summary,
step.result_ok,
step.result_message.as_deref().unwrap_or("")
)
})
.collect::<Vec<_>>()
.join("\n")
};
let req = LlmRequest {
fallback_models: Vec::new(),
model: model_name.to_owned(),
messages: vec![Message {
role: Role::User,
content: MessageContent::Parts(vec![
ContentPart::Text {
text: format!(
"User instruction:\n{instruction}\n\nAction history:\n{history_text}\n\nThe GUI agent now wants to stop with:\nThought: {thought}\nfinished(content='{content}')\n\nLook only at the CURRENT screenshot. Does it prove the user's requested end state is fully achieved? Reply with exactly one line starting with YES or NO, then a short reason. If the screenshot does not prove success, reply NO."
),
},
ContentPart::Image {
url: format!("data:image/png;base64,{snap_b64}"),
},
]),
rsclaw_hidden: None,
}],
tools: Vec::new(),
system: Some(
"You are a strict verifier for a desktop GUI automation run. Approve completion only when the current screenshot visibly proves the user's exact requested goal is done. Submitted-but-not-confirmed, missing target app, loading states, errors, ambiguity, or lack of visible proof must be NO."
.to_owned(),
),
max_tokens: Some(256),
temperature: Some(0.0),
frequency_penalty: None,
thinking_budget: None,
endpoint: AgentEndpoint::Vision,
kv_cache_mode: 0,
session_key: None,
system_shared: None,
user_system: None,
recall: None,
};
match stream_prediction(provider, req, abort).await {
Ok(verdict) => {
let normalized = verdict.trim_start().to_ascii_lowercase();
let ok = normalized.starts_with("yes");
if !ok {
info!(
verdict = %verdict.chars().take(240).collect::<String>(),
"VlmDriver: finished claim rejected"
);
}
ok
}
Err(e) => {
warn!(error = %e, "VlmDriver: finished verification failed");
false
}
}
}
const STREAM_ABORTED: &str = "vlm stream: aborted by user";
async fn stream_prediction(
provider: &dyn LlmProvider,
req: LlmRequest,
abort: &AtomicBool,
) -> Result<String> {
let mut stream = provider
.stream(req)
.await
.context("provider.stream() failed to start")?;
let mut text = String::new();
let mut reasoning = String::new();
while let Some(event) = stream.next().await {
if abort.load(Ordering::SeqCst) {
anyhow::bail!(STREAM_ABORTED);
}
match event? {
StreamEvent::TextDelta(d) => text.push_str(&d),
StreamEvent::ReasoningDelta(d) => reasoning.push_str(&d),
StreamEvent::ToolCall { .. } => {} StreamEvent::Done { .. } => break,
StreamEvent::Error(e) => anyhow::bail!("VLM stream error: {e}"),
}
}
Ok(if text.trim().is_empty() {
reasoning
} else {
text
})
}
fn parsed_to_action(p: &ParsedAction, screen_w: u32, screen_h: u32) -> Option<Action> {
let scale = |c: (f32, f32)| -> (i32, i32) {
let (x, y) = c;
(
(x * screen_w as f32 / 1000.0).round() as i32,
(y * screen_h as f32 / 1000.0).round() as i32,
)
};
let start_xy = p.start.map(scale);
let end_xy = p.end.map(scale);
let raw = &p.raw_args;
match p.action_type.as_str() {
"click" | "left_click" | "left_single" | "tap" => {
let (x, y) = start_xy?;
Some(Action::Click {
x,
y,
button: MouseButton::Left,
})
}
"right_click" | "right_single" => {
let (x, y) = start_xy?;
Some(Action::Click {
x,
y,
button: MouseButton::Right,
})
}
"middle_click" => {
let (x, y) = start_xy?;
Some(Action::Click {
x,
y,
button: MouseButton::Middle,
})
}
"left_double" | "double_click" => {
let (x, y) = start_xy?;
Some(Action::DoubleClick { x, y })
}
"mouse_move" | "hover" => {
let (x, y) = start_xy?;
Some(Action::MouseMove { x, y })
}
"drag" | "swipe" | "left_click_drag" | "select" => {
let (a, b) = start_xy?;
let (c, d) = end_xy?;
Some(Action::Drag {
from_x: a,
from_y: b,
to_x: c,
to_y: d,
})
}
"long_press" => {
let (x, y) = start_xy?;
Some(Action::Click {
x,
y,
button: MouseButton::Left,
})
}
"scroll" => {
let (x, y) = start_xy.unwrap_or((screen_w as i32 / 2, screen_h as i32 / 2));
let dir = match raw.get("direction").map(String::as_str) {
Some("up") => ScrollDir::Up,
Some("down") => ScrollDir::Down,
Some("left") => ScrollDir::Left,
Some("right") => ScrollDir::Right,
_ => ScrollDir::Down,
};
let clicks = raw
.get("clicks")
.and_then(|s| s.parse::<i32>().ok())
.unwrap_or(3);
Some(Action::Scroll {
x,
y,
direction: dir,
clicks,
})
}
"type" => {
let text = raw.get("content").cloned().unwrap_or_default();
Some(Action::Type { text })
}
"hotkey" => {
let keys = raw
.get("key")
.or_else(|| raw.get("hotkey"))
.cloned()
.unwrap_or_default();
Some(Action::Hotkey { keys })
}
"press_home" => Some(Action::Hotkey {
keys: "press_home".to_owned(),
}),
"press_back" => Some(Action::Hotkey {
keys: "press_back".to_owned(),
}),
"activate_app" | "open_app" | "launch_app" => {
let app = raw
.get("app")
.or_else(|| raw.get("app_name"))
.or_else(|| raw.get("name"))
.cloned()
.unwrap_or_default();
Some(Action::ActivateApp { app })
}
"wait" => {
let seconds = raw
.get("seconds")
.and_then(|s| s.parse::<f32>().ok())
.unwrap_or(1.0);
Some(Action::Wait { seconds })
}
_ => None,
}
}
// Anonymous constant that references `BTreeMap` outside the test module; the
// only other use in this file is in the tests below. Presumably this keeps
// the top-level `BTreeMap` import from being reported as unused in non-test
// builds — confirm before removing.
const _: fn() -> BTreeMap<String, String> = BTreeMap::new;
#[cfg(test)]
mod tests {
    use super::*;
    use crate::computer::action::ParsedAction;

    /// Builds a `ParsedAction` of the given type with string arguments and no
    /// coordinates.
    fn pa(action_type: &str, args: &[(&str, &str)]) -> ParsedAction {
        ParsedAction {
            thought: String::new(),
            action_type: action_type.to_owned(),
            raw_args: args
                .iter()
                .map(|(k, v)| ((*k).to_owned(), (*v).to_owned()))
                .collect(),
            start: None,
            end: None,
        }
    }

    /// Maps `p` for a screen of the given size, panicking when it is unmapped.
    fn mapped(p: &ParsedAction, screen: (u32, u32)) -> Action {
        parsed_to_action(p, screen.0, screen.1).unwrap()
    }

    /// Pixel position produced by a `click` at normalised point `norm`.
    fn click_pixels(norm: (f32, f32), screen: (u32, u32)) -> (i32, i32) {
        let mut p = pa("click", &[]);
        p.start = Some(norm);
        let Action::Click { x, y, .. } = mapped(&p, screen) else {
            panic!("wrong variant");
        };
        (x, y)
    }

    /// Convenience constructor for a history entry.
    fn step(summary: &str, ok: bool, message: Option<&str>) -> Step {
        Step {
            thought: String::new(),
            action_summary: summary.to_owned(),
            result_ok: ok,
            result_message: message.map(|m| m.to_owned()),
        }
    }

    #[test]
    fn maps_click_top_left_corner() {
        assert_eq!(click_pixels((0.0, 0.0), (2880, 1800)), (0, 0));
    }

    #[test]
    fn maps_click_centre_of_screen() {
        assert_eq!(click_pixels((500.0, 500.0), (2880, 1800)), (1440, 900));
    }

    #[test]
    fn maps_click_bottom_right_corner() {
        assert_eq!(click_pixels((1000.0, 1000.0), (1920, 1080)), (1920, 1080));
    }

    #[test]
    fn maps_click_arbitrary_point() {
        // 40 * 2.88 = 115.2 rounds to 115; 50 * 1.8 = 90.
        assert_eq!(click_pixels((40.0, 50.0), (2880, 1800)), (115, 90));
    }

    #[test]
    fn maps_drag_with_both_endpoints() {
        let mut p = pa("drag", &[]);
        p.start = Some((100.0, 100.0));
        p.end = Some((200.0, 200.0));
        let Action::Drag {
            from_x,
            from_y,
            to_x,
            to_y,
        } = mapped(&p, (1920, 1080))
        else {
            panic!("wrong variant");
        };
        assert_eq!((from_x, from_y), (192, 108));
        assert_eq!((to_x, to_y), (384, 216));
    }

    #[test]
    fn maps_type_action() {
        let p = pa("type", &[("content", "hello world")]);
        let Action::Type { text } = mapped(&p, (1920, 1080)) else {
            panic!("wrong variant");
        };
        assert_eq!(text, "hello world");
    }

    #[test]
    fn terminal_action_text_prefers_reason_for_call_user() {
        let p = pa("call_user", &[("reason", "login required")]);
        let text = terminal_action_text(&p, &["reason", "content"]);
        assert_eq!(text.as_deref(), Some("login required"));
    }

    #[test]
    fn terminal_action_text_keeps_content_for_finished() {
        let p = pa("finished", &[("content", "sent")]);
        let text = terminal_action_text(&p, &["content"]);
        assert_eq!(text.as_deref(), Some("sent"));
    }

    #[test]
    fn maps_scroll_with_direction() {
        let mut p = pa("scroll", &[("direction", "up"), ("clicks", "5")]);
        p.start = Some((1000.0, 500.0));
        let Action::Scroll {
            direction, clicks, ..
        } = mapped(&p, (1920, 1080))
        else {
            panic!("wrong variant");
        };
        assert!(matches!(direction, ScrollDir::Up));
        assert_eq!(clicks, 5);
    }

    #[test]
    fn unmapped_action_returns_none() {
        let p = pa("teleport", &[]);
        assert!(parsed_to_action(&p, 1920, 1080).is_none());
    }

    #[test]
    fn build_user_message_with_history() {
        let history = vec![
            step("click(start_box=...)", true, None),
            step("type(content=hello)", false, Some("not focused")),
        ];
        let msg = build_user_message("send a hi", &history);
        for expected in ["Task: send a hi", "1. click", "2. type", "not focused"] {
            assert!(msg.contains(expected));
        }
    }

    #[test]
    fn build_user_message_no_history() {
        let msg = build_user_message("open WeChat", &[]);
        assert_eq!(msg, "Task: open WeChat");
    }
}