1#![allow(clippy::result_large_err)]
2use std::path::{Path, PathBuf};
3use std::process::Stdio;
4
5use base64::Engine;
6use chrono::Utc;
7use desktop_core::{
8 ActionReceipt, ActionRequest, ArtifactRef, CursorPosition, MouseButton, Observation,
9 ScreenshotData, StructuredError, WindowMetadata,
10};
11use serde_json::{Value, json};
12use tokio::fs;
13use tokio::process::Command;
14
/// Settings used to construct a `LinuxBackend`.
#[derive(Debug, Clone)]
pub struct BackendOptions {
    /// Value exported as the `DISPLAY` environment variable to spawned tools.
    pub display: String,
    /// Directory in which screenshot files are written.
    pub artifacts_dir: PathBuf,
    /// Command line used to launch the browser for `BrowserOpen` actions.
    pub browser_command: String,
    /// Extra `(name, value)` environment pairs exported after `DISPLAY`.
    pub session_env: Vec<(String, String)>,
}
22
23#[derive(Debug, Clone)]
24pub struct LinuxBackend {
25 options: BackendOptions,
26}
27
28impl LinuxBackend {
29 pub fn new(options: BackendOptions) -> Self {
30 Self { options }
31 }
32
33 pub fn display(&self) -> &str {
34 &self.options.display
35 }
36
37 pub fn artifacts_dir(&self) -> &Path {
38 &self.options.artifacts_dir
39 }
40
41 pub fn browser_command(&self) -> &str {
42 &self.options.browser_command
43 }
44
45 fn apply_display_env(&self, command: &mut Command) {
46 command.env("DISPLAY", &self.options.display);
47 for (key, value) in &self.options.session_env {
48 command.env(key, value);
49 }
50 }
51
52 pub fn capabilities(&self) -> Vec<String> {
53 let mut caps = vec![
54 "screenshot".to_string(),
55 "shell".to_string(),
56 "filesystem".to_string(),
57 ];
58 if Self::tool_exists("xdotool") {
59 caps.extend([
60 "mouse".to_string(),
61 "keyboard".to_string(),
62 "window_focus".to_string(),
63 "window_resize".to_string(),
64 ]);
65 }
66 if Self::tool_exists("xprop") {
67 caps.push("window_metadata".to_string());
68 }
69 if Self::tool_exists(&self.options.browser_command) {
70 caps.push("browser_open".to_string());
71 }
72 caps
73 }
74
75 pub async fn observation(&self) -> Result<Observation, StructuredError> {
76 let screenshot = self.capture_screenshot().await?;
77 let active_window = self.active_window().await.ok();
78 let cursor_position = self.cursor_position().await.ok();
79 let active_window_title = active_window
80 .as_ref()
81 .and_then(|window| window.title.clone());
82 Ok(Observation {
83 captured_at: Utc::now(),
84 capability_flags: self.capabilities(),
85 active_window,
86 cursor_position,
87 browser: None,
88 raw: json!({
89 "display": self.options.display,
90 }),
91 summary: json!({
92 "display": self.options.display,
93 "active_window": active_window_title,
94 }),
95 screenshot,
96 })
97 }
98
99 pub async fn screenshot_png(&self) -> Result<(Vec<u8>, PathBuf), StructuredError> {
100 let screenshot = self.capture_screenshot().await?;
101 let path = screenshot
102 .artifact_path
103 .clone()
104 .ok_or_else(|| self.io_error("screenshot artifact path missing".to_string()))?;
105 let bytes = fs::read(&path)
106 .await
107 .map_err(|error| self.io_error(error.to_string()))?;
108 Ok((bytes, PathBuf::from(path)))
109 }
110
111 pub async fn perform_action(&self, action: ActionRequest) -> ActionReceipt {
112 let started_at = Utc::now();
113 let action_name = action.action_name().to_string();
114 match self.perform_action_inner(action).await {
115 Ok((result, artifacts)) => {
116 ActionReceipt::success(&action_name, started_at, result, artifacts)
117 }
118 Err(error) => ActionReceipt::failure(&action_name, started_at, error),
119 }
120 }
121
122 async fn perform_action_inner(
123 &self,
124 action: ActionRequest,
125 ) -> Result<(Value, Vec<ArtifactRef>), StructuredError> {
126 match action {
127 ActionRequest::MouseMove { x, y, .. } => {
128 self.run_xdotool(["mousemove", &x.to_string(), &y.to_string()])
129 .await?;
130 Ok((json!({"x": x, "y": y}), vec![]))
131 }
132 ActionRequest::MouseClick { button, x, y, .. } => {
133 if let (Some(x), Some(y)) = (x, y) {
134 self.run_xdotool(["mousemove", &x.to_string(), &y.to_string()])
135 .await?;
136 }
137 let button_number = match button.unwrap_or(MouseButton::Left) {
138 MouseButton::Left => "1",
139 MouseButton::Middle => "2",
140 MouseButton::Right => "3",
141 };
142 self.run_xdotool(["click", button_number]).await?;
143 Ok((json!({"button": button_number}), vec![]))
144 }
145 ActionRequest::MouseDrag {
146 start_x,
147 start_y,
148 end_x,
149 end_y,
150 ..
151 } => {
152 self.run_xdotool(["mousemove", &start_x.to_string(), &start_y.to_string()])
153 .await?;
154 self.run_xdotool(["mousedown", "1"]).await?;
155 self.run_xdotool(["mousemove", &end_x.to_string(), &end_y.to_string()])
156 .await?;
157 self.run_xdotool(["mouseup", "1"]).await?;
158 Ok((
159 json!({"start": [start_x, start_y], "end": [end_x, end_y]}),
160 vec![],
161 ))
162 }
163 ActionRequest::KeyPress { key, .. } => {
164 self.run_xdotool(["key", &key]).await?;
165 Ok((json!({"key": key}), vec![]))
166 }
167 ActionRequest::TypeText { text, .. } => {
168 self.run_xdotool(["type", "--delay", "1", &text]).await?;
169 Ok((json!({"typed": text}), vec![]))
170 }
171 ActionRequest::Hotkey { keys, .. } => {
172 let joined = keys.join("+");
173 self.run_xdotool(["key", &joined]).await?;
174 Ok((json!({"keys": keys}), vec![]))
175 }
176 ActionRequest::Scroll {
177 delta_x: _,
178 delta_y,
179 ..
180 } => {
181 if delta_y == 0 {
182 return Err(self.unsupported(
183 "horizontal-only scroll is not supported by the xdotool fallback",
184 ));
185 }
186 let button = if delta_y > 0 { "4" } else { "5" };
187 let clicks = (delta_y.abs().max(1) / 120) + 1;
188 for _ in 0..clicks {
189 self.run_xdotool(["click", button]).await?;
190 }
191 Ok((
192 json!({"delta_y": delta_y, "emulated_clicks": clicks}),
193 vec![],
194 ))
195 }
196 ActionRequest::OpenApp { name, .. } => {
197 self.run_shell_background(&name).await?;
198 Ok((json!({"command": name}), vec![]))
199 }
200 ActionRequest::FocusWindow { window_id, .. } => {
201 self.run_xdotool(["windowactivate", &window_id]).await?;
202 Ok((json!({"window_id": window_id}), vec![]))
203 }
204 ActionRequest::ResizeWindow {
205 window_id, bounds, ..
206 } => {
207 self.run_xdotool([
208 "windowsize",
209 &window_id,
210 &bounds.width.to_string(),
211 &bounds.height.to_string(),
212 ])
213 .await?;
214 self.run_xdotool([
215 "windowmove",
216 &window_id,
217 &bounds.x.to_string(),
218 &bounds.y.to_string(),
219 ])
220 .await?;
221 Ok((json!({"window_id": window_id, "bounds": bounds}), vec![]))
222 }
223 ActionRequest::RunCommand {
224 command, cwd, env, ..
225 } => {
226 let mut cmd = Command::new("sh");
227 cmd.arg("-lc").arg(&command);
228 cmd.env("DISPLAY", &self.options.display);
229 if let Some(cwd) = cwd.as_ref() {
230 cmd.current_dir(cwd);
231 }
232 if let Some(env_map) = env {
233 for (key, value) in env_map {
234 cmd.env(key, value);
235 }
236 }
237 let output = cmd
238 .output()
239 .await
240 .map_err(|error| self.io_error(error.to_string()))?;
241 Ok((
242 json!({
243 "stdout": String::from_utf8_lossy(&output.stdout),
244 "stderr": String::from_utf8_lossy(&output.stderr),
245 "exit_code": output.status.code(),
246 }),
247 vec![],
248 ))
249 }
250 ActionRequest::ReadFile { path, .. } => {
251 let contents = fs::read_to_string(&path)
252 .await
253 .map_err(|error| self.io_error(error.to_string()))?;
254 Ok((json!({"path": path, "contents": contents}), vec![]))
255 }
256 ActionRequest::WriteFile { path, contents, .. } => {
257 if let Some(parent) = Path::new(&path).parent() {
258 fs::create_dir_all(parent)
259 .await
260 .map_err(|error| self.io_error(error.to_string()))?;
261 }
262 fs::write(&path, contents.as_bytes())
263 .await
264 .map_err(|error| self.io_error(error.to_string()))?;
265 Ok((
266 json!({"path": path, "bytes_written": contents.len()}),
267 vec![],
268 ))
269 }
270 ActionRequest::BrowserOpen { url, .. } => {
271 let escaped = url.replace('"', "\\\"").replace('\'', "'\\''");
272 self.run_shell_background(&format!(
273 "{} '{}'",
274 self.options.browser_command, escaped
275 ))
276 .await?;
277 Ok((json!({"url": url, "mode": "desktop_fallback"}), vec![]))
278 }
279 ActionRequest::BrowserGetDom { .. }
280 | ActionRequest::BrowserClick { .. }
281 | ActionRequest::BrowserType { .. }
282 | ActionRequest::BrowserScreenshot { .. } => Err(self.unsupported(
283 "browser-specialized actions are handled by the control-plane browser adapter",
284 )),
285 }
286 }
287
288 async fn capture_screenshot(&self) -> Result<ScreenshotData, StructuredError> {
289 self.ensure_tool("import")?;
290 fs::create_dir_all(&self.options.artifacts_dir)
291 .await
292 .map_err(|error| self.io_error(error.to_string()))?;
293 let screenshot_path = self
294 .options
295 .artifacts_dir
296 .join(format!("screenshot-{}.png", Utc::now().timestamp_millis()));
297 let mut command = Command::new("import");
298 command.args([
299 "-window",
300 "root",
301 screenshot_path.to_string_lossy().as_ref(),
302 ]);
303 self.apply_display_env(&mut command);
304 let output = command
305 .output()
306 .await
307 .map_err(|error| self.io_error(error.to_string()))?;
308 if !output.status.success() {
309 return Err(self.command_error(
310 "import",
311 String::from_utf8_lossy(&output.stderr).into_owned(),
312 ));
313 }
314 let data = fs::read(&screenshot_path)
315 .await
316 .map_err(|error| self.io_error(error.to_string()))?;
317 Ok(ScreenshotData {
318 mime_type: "image/png".to_string(),
319 data_base64: Some(base64::engine::general_purpose::STANDARD.encode(data)),
320 width: None,
321 height: None,
322 artifact_path: Some(screenshot_path.to_string_lossy().to_string()),
323 })
324 }
325
326 async fn active_window(&self) -> Result<WindowMetadata, StructuredError> {
327 self.ensure_tool("xdotool")?;
328 let id = self
329 .run_command_capture("xdotool", ["getactivewindow"])
330 .await?;
331 let title = self
332 .run_command_capture("xdotool", ["getactivewindow", "getwindowname"])
333 .await
334 .unwrap_or_default();
335 let class_name = if Self::tool_exists("xprop") {
336 self.run_command_capture("xprop", ["-id", id.trim(), "WM_CLASS"])
337 .await
338 .ok()
339 } else {
340 None
341 };
342 Ok(WindowMetadata {
343 id: Some(id.trim().to_string()),
344 title: Some(title.trim().to_string()).filter(|value| !value.is_empty()),
345 class_name: class_name.map(|value| value.trim().to_string()),
346 })
347 }
348
349 async fn cursor_position(&self) -> Result<CursorPosition, StructuredError> {
350 self.ensure_tool("xdotool")?;
351 let output = self
352 .run_command_capture("xdotool", ["getmouselocation", "--shell"])
353 .await?;
354 let mut x = 0;
355 let mut y = 0;
356 let mut screen = None;
357 for line in output.lines() {
358 if let Some(value) = line.strip_prefix("X=") {
359 x = value.parse().unwrap_or_default();
360 } else if let Some(value) = line.strip_prefix("Y=") {
361 y = value.parse().unwrap_or_default();
362 } else if let Some(value) = line.strip_prefix("SCREEN=") {
363 screen = Some(value.to_string());
364 }
365 }
366 Ok(CursorPosition { x, y, screen })
367 }
368
369 async fn run_xdotool<I, S>(&self, args: I) -> Result<(), StructuredError>
370 where
371 I: IntoIterator<Item = S>,
372 S: AsRef<str>,
373 {
374 self.ensure_tool("xdotool")?;
375 let rendered: Vec<String> = args
376 .into_iter()
377 .map(|value| value.as_ref().to_string())
378 .collect();
379 let mut command = Command::new("xdotool");
380 command.args(&rendered);
381 self.apply_display_env(&mut command);
382 let output = command
383 .output()
384 .await
385 .map_err(|error| self.io_error(error.to_string()))?;
386 if output.status.success() {
387 Ok(())
388 } else {
389 Err(self.command_error(
390 "xdotool",
391 String::from_utf8_lossy(&output.stderr).into_owned(),
392 ))
393 }
394 }
395
396 async fn run_shell_background(&self, command: &str) -> Result<(), StructuredError> {
397 let mut child = Command::new("sh");
398 child
399 .arg("-lc")
400 .arg(format!("{} >/dev/null 2>&1 &", command))
401 .stdout(Stdio::null())
402 .stderr(Stdio::null());
403 self.apply_display_env(&mut child);
404 child
405 .spawn()
406 .map_err(|error| self.io_error(error.to_string()))?;
407 Ok(())
408 }
409
410 async fn run_command_capture<I, S>(
411 &self,
412 binary: &str,
413 args: I,
414 ) -> Result<String, StructuredError>
415 where
416 I: IntoIterator<Item = S>,
417 S: AsRef<str>,
418 {
419 let rendered: Vec<String> = args
420 .into_iter()
421 .map(|value| value.as_ref().to_string())
422 .collect();
423 let mut command = Command::new(binary);
424 command.args(&rendered);
425 self.apply_display_env(&mut command);
426 let output = command
427 .output()
428 .await
429 .map_err(|error| self.io_error(error.to_string()))?;
430 if output.status.success() {
431 Ok(String::from_utf8_lossy(&output.stdout).into_owned())
432 } else {
433 Err(self.command_error(binary, String::from_utf8_lossy(&output.stderr).into_owned()))
434 }
435 }
436
437 fn ensure_tool(&self, tool: &str) -> Result<(), StructuredError> {
438 if Self::tool_exists(tool) {
439 Ok(())
440 } else {
441 Err(self.missing_tool(tool))
442 }
443 }
444
/// Reports whether `tool` resolves to a command in a login shell.
///
/// Only the first whitespace-separated word of `tool` is probed, so a command
/// line such as `firefox --private-window` is checked by its program name.
/// The name is handed to the shell as a positional parameter instead of being
/// spliced into the script, so shell metacharacters in `tool` are never
/// executed. An empty or all-whitespace `tool` yields `false`, as does any
/// failure to run the shell.
///
/// This spawns a process and blocks until it exits.
pub fn tool_exists(tool: &str) -> bool {
    let Some(program) = tool.split_whitespace().next() else {
        return false;
    };
    std::process::Command::new("sh")
        .arg("-lc")
        .arg(r#"command -v "$1" >/dev/null 2>&1"#)
        // The first argument after the script becomes `$0`; the probed name is `$1`.
        .arg("sh")
        .arg(program)
        .status()
        .map(|status| status.success())
        .unwrap_or(false)
}
453
454 fn missing_tool(&self, tool: &str) -> StructuredError {
455 StructuredError {
456 code: "missing_tool".to_string(),
457 message: format!("Required system tool `{tool}` is not available in the sandbox."),
458 retryable: false,
459 category: "environment".to_string(),
460 details: json!({"tool": tool}),
461 artifact_refs: vec![],
462 }
463 }
464
465 fn command_error(&self, binary: &str, stderr: String) -> StructuredError {
466 StructuredError {
467 code: "command_failed".to_string(),
468 message: format!("Command `{binary}` failed."),
469 retryable: true,
470 category: "execution".to_string(),
471 details: json!({"binary": binary, "stderr": stderr}),
472 artifact_refs: vec![],
473 }
474 }
475
476 fn unsupported(&self, message: &str) -> StructuredError {
477 StructuredError {
478 code: "unsupported".to_string(),
479 message: message.to_string(),
480 retryable: false,
481 category: "unsupported".to_string(),
482 details: json!({}),
483 artifact_refs: vec![],
484 }
485 }
486
487 fn io_error(&self, message: String) -> StructuredError {
488 StructuredError {
489 code: "io_error".to_string(),
490 message,
491 retryable: false,
492 category: "io".to_string(),
493 details: json!({}),
494 artifact_refs: vec![],
495 }
496 }
497}
498
#[cfg(test)]
mod tests {
    use super::*;

    /// `shell` is one of the capabilities reported regardless of installed tools.
    #[test]
    fn capabilities_are_non_empty() {
        let options = BackendOptions {
            display: ":99".to_string(),
            artifacts_dir: PathBuf::from("artifacts/test"),
            browser_command: "firefox".to_string(),
            session_env: vec![],
        };
        let backend = LinuxBackend::new(options);
        assert!(backend.capabilities().contains(&"shell".to_string()));
    }
}