Skip to main content

punch_runtime/automation/
mod.rs

1//! Desktop automation: screenshots, OCR, and accessibility-based UI interaction.
2//!
3//! The [`AutomationBackend`] trait provides a platform-agnostic interface for
4//! desktop automation capabilities. Platform-specific implementations live behind
5//! `#[cfg]` gates.
6//!
7//! ## Platform support
8//!
9//! - **macOS** (primary): Full support via `screencapture` + System Events accessibility APIs.
//! - **Linux** (best-effort): Screenshots via `scrot`/`import`, window listing via `wmctrl`, OCR via `tesseract`; UI interaction is not yet implemented.
//! - **Windows** (stub): screenshots and UI tools currently return "not yet implemented".
12
13pub mod common;
14
15pub use common::*;
16
17use async_trait::async_trait;
18use punch_types::{PunchError, PunchResult};
19
/// Allowed accessibility attributes for `read_element_attribute`.
///
/// Attributes are used as unquoted identifiers in AppleScript, so they CANNOT
/// be escaped — they must be validated against this allowlist.
///
/// `validate_attribute` checks membership with an exact, case-sensitive string
/// comparison, so callers must spell the attribute exactly as listed here
/// (including the single spaces inside multi-word names).
#[cfg(any(target_os = "macos", test))]
const ALLOWED_ATTRIBUTES: &[&str] = &[
    "value",
    "name",
    "role",
    "role description",
    "title",
    "description",
    "enabled",
    "focused",
    "position",
    "size",
    "selected",
    "help",
    "subrole",
    "identifier",
    "minimum value",
    "maximum value",
    "orientation",
    "placeholder value",
];
45
/// Platform-agnostic desktop automation backend.
///
/// A concrete implementation is chosen at compile time by [`create_backend`].
/// The `Send + Sync` bound allows a backend to be shared across threads.
///
/// Several methods take an *element ID*. Element IDs are strings of the form
/// `"AppName:role:index"` as accepted by [`parse_element_id`].
#[async_trait]
pub trait AutomationBackend: Send + Sync {
    // ---- Vision ----

    /// Capture a screenshot of the full screen or a specific window.
    ///
    /// If `window` is `Some`, captures only that window (matched by title).
    /// Returns base64-encoded PNG data.
    async fn screenshot(&self, window: Option<&str>) -> PunchResult<ScreenshotResult>;

    /// Capture a screenshot of a specific UI region by bounds.
    ///
    /// If `element_id` is provided, captures the region of that element.
    /// If `bounds` is provided, captures that exact rectangle (x, y, w, h).
    async fn ui_screenshot(
        &self,
        element_id: Option<&str>,
        bounds: Option<(i32, i32, u32, u32)>,
    ) -> PunchResult<ScreenshotResult>;

    /// Extract text from an app window using OCR.
    ///
    /// This is cheaper than a screenshot + vision model for text-heavy content.
    async fn app_ocr(&self, app: &str) -> PunchResult<OcrResult>;

    // ---- UI interaction ----

    /// List all visible windows with their titles and owning apps.
    async fn list_windows(&self) -> PunchResult<Vec<WindowInfo>>;

    /// Query the accessibility tree for UI elements matching a selector.
    ///
    /// `app` names the application to inspect; `selector` narrows the result.
    async fn find_ui_elements(
        &self,
        app: &str,
        selector: &UiSelector,
    ) -> PunchResult<Vec<UiElement>>;

    /// Click a UI element by its element ID (from `find_ui_elements`).
    async fn click_element(&self, element_id: &str) -> PunchResult<()>;

    /// Type text into a UI element by its element ID.
    async fn type_text(&self, element_id: &str, text: &str) -> PunchResult<()>;

    /// Read an accessibility attribute from a UI element.
    ///
    /// Returns the attribute's value rendered as a string.
    async fn read_element_attribute(
        &self,
        element_id: &str,
        attribute: &str,
    ) -> PunchResult<String>;
}
97
/// Create the platform-appropriate automation backend.
///
/// The four `#[cfg]` conditions below are mutually exclusive and cover every
/// target, so exactly one block survives compilation and becomes the
/// function's tail expression.
pub fn create_backend() -> Box<dyn AutomationBackend> {
    #[cfg(target_os = "macos")]
    {
        Box::new(MacOsBackend::new())
    }
    #[cfg(target_os = "linux")]
    {
        Box::new(LinuxBackend)
    }
    #[cfg(target_os = "windows")]
    {
        Box::new(WindowsBackend)
    }
    // Fallback for every other target. NOTE(review): `StubBackend` is not
    // defined in the portion of the file reviewed here — presumably it lives
    // further down or in `common`.
    #[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
    {
        Box::new(StubBackend)
    }
}
117
118// ---------------------------------------------------------------------------
119// Security helpers
120// ---------------------------------------------------------------------------
121
/// Escape a string for safe interpolation into AppleScript double-quoted strings.
///
/// Backslashes and double quotes are backslash-escaped; line feeds and
/// carriage returns become the two-character sequences `\n` and `\r`. Every
/// other character is copied through unchanged.
#[cfg(any(target_os = "macos", test))]
fn escape_applescript(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\\\""),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            other => escaped.push(other),
        }
    }
    escaped
}
130
/// Check that a role filter is safe to splice into AppleScript.
///
/// Roles end up as unquoted identifiers in the generated script, so only ASCII
/// letters, ASCII digits, spaces, and underscores are accepted.
///
/// # Errors
///
/// Returns `PunchError::Tool` (tool `ui_find_elements`) if any other character
/// is present.
#[cfg(any(target_os = "macos", test))]
fn validate_role_filter(role: &str) -> PunchResult<()> {
    let is_safe = |c: char| c.is_ascii_alphanumeric() || matches!(c, ' ' | '_');

    if role.chars().any(|c| !is_safe(c)) {
        return Err(PunchError::Tool {
            tool: "ui_find_elements".into(),
            message: format!(
                "invalid role filter: {role:?} — only letters, digits, spaces, and underscores allowed"
            ),
        });
    }

    Ok(())
}
149
/// Check an attribute name against `ALLOWED_ATTRIBUTES`.
///
/// The comparison is exact and case-sensitive.
///
/// # Errors
///
/// Returns `PunchError::Tool` (tool `ui_read_attribute`) listing the permitted
/// names when `attribute` is not one of them.
#[cfg(any(target_os = "macos", test))]
fn validate_attribute(attribute: &str) -> PunchResult<()> {
    let permitted = ALLOWED_ATTRIBUTES
        .iter()
        .any(|allowed| *allowed == attribute);
    if permitted {
        return Ok(());
    }

    Err(PunchError::Tool {
        tool: "ui_read_attribute".into(),
        message: format!(
            "attribute {attribute:?} is not allowed. Allowed: {}",
            ALLOWED_ATTRIBUTES.join(", ")
        ),
    })
}
165
166/// Parse an element ID ("AppName:index") into (app_name, index).
167pub fn parse_element_id(element_id: &str, tool: &str) -> PunchResult<(String, String, usize)> {
168    let parts: Vec<&str> = element_id.splitn(3, ':').collect();
169    if parts.len() != 3 {
170        return Err(PunchError::Tool {
171            tool: tool.into(),
172            message: format!(
173                "invalid element_id format: {element_id:?} — expected \"AppName:role:index\""
174            ),
175        });
176    }
177    let app = parts[0];
178    if app.is_empty() {
179        return Err(PunchError::Tool {
180            tool: tool.into(),
181            message: format!("invalid element_id: empty app name in {element_id:?}"),
182        });
183    }
184    let role = parts[1];
185    if role.is_empty() {
186        return Err(PunchError::Tool {
187            tool: tool.into(),
188            message: format!("invalid element_id: empty role in {element_id:?}"),
189        });
190    }
191    let index: usize = parts[2].parse().map_err(|_| PunchError::Tool {
192        tool: tool.into(),
193        message: format!("invalid element_id index: {element_id:?} — index must be a number"),
194    })?;
195    Ok((app.to_string(), role.to_string(), index))
196}
197
198/// Extract just the app name from an element ID.
199pub fn extract_app_from_element_id(element_id: &str, tool: &str) -> PunchResult<String> {
200    parse_element_id(element_id, tool).map(|(app, _, _)| app)
201}
202
203// ---------------------------------------------------------------------------
204// macOS backend
205// ---------------------------------------------------------------------------
206
/// macOS automation backend.
///
/// The methods in this file drive the `osascript`, `screencapture`, and
/// `tesseract` command-line tools.
#[cfg(target_os = "macos")]
pub struct MacOsBackend {
    /// Temporary directory for screenshot files.
    tmp_dir: String,
}
212
/// `Default` simply delegates to [`MacOsBackend::new`].
#[cfg(target_os = "macos")]
impl Default for MacOsBackend {
    fn default() -> Self {
        Self::new()
    }
}
219
#[cfg(target_os = "macos")]
impl MacOsBackend {
    /// Create a backend that stages temporary files in the system temp directory.
    pub fn new() -> Self {
        let tmp_dir = std::env::temp_dir().to_string_lossy().into_owned();
        Self { tmp_dir }
    }

    /// Run `osascript -e <script>` and return its trimmed stdout.
    ///
    /// # Errors
    ///
    /// Returns `PunchError::Tool` (tool `automation`) if the process cannot be
    /// started or exits unsuccessfully. When stderr looks like an
    /// accessibility-permission problem, the message tells the user where to
    /// grant access instead of echoing stderr.
    async fn run_osascript(&self, script: &str) -> PunchResult<String> {
        let spawned = tokio::process::Command::new("osascript")
            .arg("-e")
            .arg(script)
            .output()
            .await;

        let output = match spawned {
            Ok(output) => output,
            Err(e) => {
                return Err(PunchError::Tool {
                    tool: "automation".into(),
                    message: format!("failed to run osascript: {e}"),
                });
            }
        };

        if output.status.success() {
            return Ok(String::from_utf8_lossy(&output.stdout).trim().to_string());
        }

        let stderr = String::from_utf8_lossy(&output.stderr);

        // Substrings that indicate a missing accessibility permission.
        let permission_markers = ["not allowed assistive access", "accessibility", "AXError"];
        let needs_permission = permission_markers
            .iter()
            .any(|marker| stderr.contains(*marker));

        let message: String = if needs_permission {
            "Accessibility access required. Go to System Settings > Privacy & Security > Accessibility and enable the terminal app running Punch.".into()
        } else {
            format!("osascript failed: {}", stderr.trim())
        };

        Err(PunchError::Tool {
            tool: "automation".into(),
            message,
        })
    }
}
261
262#[cfg(target_os = "macos")]
263#[async_trait]
264impl AutomationBackend for MacOsBackend {
265    async fn screenshot(&self, window: Option<&str>) -> PunchResult<ScreenshotResult> {
266        use base64::Engine;
267
268        let path = format!(
269            "{}/punch_screenshot_{}.png",
270            self.tmp_dir,
271            uuid::Uuid::new_v4()
272        );
273
274        let mut cmd = tokio::process::Command::new("screencapture");
275        cmd.arg("-x") // no sound
276            .arg("-t")
277            .arg("png");
278
279        if let Some(win_title) = window {
280            // Get window ID by title, then capture that window.
281            let escaped = escape_applescript(win_title);
282            let script = format!(
283                r#"tell application "System Events" to get id of first window of (first application process whose name is "{escaped}") whose name contains "{escaped}""#
284            );
285            match self.run_osascript(&script).await {
286                Ok(window_id) => {
287                    cmd.arg("-l").arg(window_id.trim());
288                }
289                Err(_) => {
290                    // Fallback: try matching by window title directly.
291                    let script2 = format!(
292                        r#"tell application "System Events"
293set wList to every window of every application process whose name contains "{escaped}"
294if (count of wList) > 0 then
295    return id of item 1 of wList
296end if
297end tell"#
298                    );
299                    match self.run_osascript(&script2).await {
300                        Ok(wid) if !wid.is_empty() => {
301                            cmd.arg("-l").arg(wid.trim());
302                        }
303                        _ => {
304                            // Last resort: capture full screen.
305                        }
306                    }
307                }
308            }
309        }
310
311        cmd.arg(&path);
312
313        let output = cmd.output().await.map_err(|e| PunchError::Tool {
314            tool: "sys_screenshot".into(),
315            message: format!("failed to run screencapture: {e}"),
316        })?;
317
318        if !output.status.success() {
319            return Err(PunchError::Tool {
320                tool: "sys_screenshot".into(),
321                message: format!(
322                    "screencapture failed: {}",
323                    String::from_utf8_lossy(&output.stderr)
324                ),
325            });
326        }
327
328        // Read the file and check for blank screenshots (permission issue).
329        let data = tokio::fs::read(&path).await.map_err(|e| PunchError::Tool {
330            tool: "sys_screenshot".into(),
331            message: format!("failed to read screenshot file: {e}"),
332        })?;
333
334        // Clean up the temp file.
335        let _ = tokio::fs::remove_file(&path).await;
336
337        if data.len() < 1024 {
338            return Err(PunchError::Tool {
339                tool: "sys_screenshot".into(),
340                message: "Screenshot appears blank. Grant Screen Recording permission in System Settings > Privacy & Security > Screen Recording.".into(),
341            });
342        }
343
344        // Parse PNG header for dimensions.
345        let (width, height) = parse_png_dimensions(&data).unwrap_or((0, 0));
346
347        let png_base64 = base64::engine::general_purpose::STANDARD.encode(&data);
348
349        Ok(ScreenshotResult {
350            png_base64,
351            width,
352            height,
353        })
354    }
355
356    async fn ui_screenshot(
357        &self,
358        element_id: Option<&str>,
359        bounds: Option<(i32, i32, u32, u32)>,
360    ) -> PunchResult<ScreenshotResult> {
361        use base64::Engine;
362
363        let path = format!(
364            "{}/punch_ui_screenshot_{}.png",
365            self.tmp_dir,
366            std::process::id()
367        );
368
369        let mut cmd = tokio::process::Command::new("screencapture");
370        cmd.arg("-x").arg("-t").arg("png");
371
372        if let Some((x, y, w, h)) = bounds {
373            cmd.arg("-R").arg(format!("{x},{y},{w},{h}"));
374        } else if let Some(eid) = element_id {
375            // Get element bounds via accessibility, then capture that region.
376            let (app, role, index) = parse_element_id(eid, "ui_screenshot")?;
377            let escaped_app = escape_applescript(&app);
378            let script = format!(
379                r#"tell application "System Events" to tell process "{escaped_app}"
380set el to {role} {} of window 1
381set p to position of el
382set s to size of el
383return (item 1 of p as text) & "," & (item 2 of p as text) & "," & (item 1 of s as text) & "," & (item 2 of s as text)
384end tell"#,
385                index + 1 // AppleScript is 1-based
386            );
387            let bounds_str = self.run_osascript(&script).await?;
388            cmd.arg("-R").arg(bounds_str);
389        }
390
391        cmd.arg(&path);
392
393        let output = cmd.output().await.map_err(|e| PunchError::Tool {
394            tool: "ui_screenshot".into(),
395            message: format!("failed to run screencapture: {e}"),
396        })?;
397
398        if !output.status.success() {
399            return Err(PunchError::Tool {
400                tool: "ui_screenshot".into(),
401                message: format!(
402                    "screencapture failed: {}",
403                    String::from_utf8_lossy(&output.stderr)
404                ),
405            });
406        }
407
408        let data = tokio::fs::read(&path).await.map_err(|e| PunchError::Tool {
409            tool: "ui_screenshot".into(),
410            message: format!("failed to read screenshot file: {e}"),
411        })?;
412        let _ = tokio::fs::remove_file(&path).await;
413
414        if data.len() < 1024 {
415            return Err(PunchError::Tool {
416                tool: "ui_screenshot".into(),
417                message: "Screenshot appears blank. Grant Screen Recording permission.".into(),
418            });
419        }
420
421        let (width, height) = parse_png_dimensions(&data).unwrap_or((0, 0));
422        let png_base64 = base64::engine::general_purpose::STANDARD.encode(&data);
423
424        Ok(ScreenshotResult {
425            png_base64,
426            width,
427            height,
428        })
429    }
430
431    async fn app_ocr(&self, app: &str) -> PunchResult<OcrResult> {
432        use base64::Engine;
433
434        // First capture a screenshot of the app window.
435        let screenshot = self.screenshot(Some(app)).await?;
436
437        // Try macOS Vision framework via a Swift one-liner.
438        // Falls back to tesseract if Vision is unavailable.
439        let tmp_img = format!("{}/punch_ocr_{}.png", self.tmp_dir, std::process::id());
440        let img_data = base64::engine::general_purpose::STANDARD
441            .decode(&screenshot.png_base64)
442            .map_err(|e| PunchError::Tool {
443                tool: "app_ocr".into(),
444                message: format!("failed to decode screenshot: {e}"),
445            })?;
446        tokio::fs::write(&tmp_img, &img_data)
447            .await
448            .map_err(|e| PunchError::Tool {
449                tool: "app_ocr".into(),
450                message: format!("failed to write temp image: {e}"),
451            })?;
452
453        // Try tesseract first (widely available via homebrew).
454        let output = tokio::process::Command::new("tesseract")
455            .arg(&tmp_img)
456            .arg("stdout")
457            .output()
458            .await;
459
460        let _ = tokio::fs::remove_file(&tmp_img).await;
461
462        match output {
463            Ok(out) if out.status.success() => {
464                let text = String::from_utf8_lossy(&out.stdout).trim().to_string();
465                let confidence = if text.is_empty() { 0.0 } else { 0.7 };
466                if text.is_empty() {
467                    return Ok(OcrResult {
468                        text: String::new(),
469                        regions: vec![OcrRegion {
470                            text: String::new(),
471                            bounds: None,
472                            confidence: 0.0,
473                        }],
474                    });
475                }
476                Ok(OcrResult {
477                    text: text.clone(),
478                    regions: vec![OcrRegion {
479                        text,
480                        bounds: None,
481                        confidence,
482                    }],
483                })
484            }
485            _ => {
486                // Tesseract not available — return a helpful error.
487                Err(PunchError::Tool {
488                    tool: "app_ocr".into(),
489                    message: "OCR requires tesseract. Install it: brew install tesseract".into(),
490                })
491            }
492        }
493    }
494
495    async fn list_windows(&self) -> PunchResult<Vec<WindowInfo>> {
496        let script = r#"tell application "System Events"
497set windowList to ""
498repeat with proc in (every application process whose background only is false)
499    set procName to name of proc
500    repeat with win in (every window of proc)
501        set winTitle to name of win
502        set winPos to position of win
503        set winSize to size of win
504        set winMin to false
505        try
506            set winMin to value of attribute "AXMinimized" of win
507        end try
508        set windowList to windowList & procName & "|||" & winTitle & "|||" & (item 1 of winPos as text) & "," & (item 2 of winPos as text) & "|||" & (item 1 of winSize as text) & "," & (item 2 of winSize as text) & "|||" & (winMin as text) & linefeed
509    end repeat
510end repeat
511return windowList
512end tell"#;
513
514        let result = self.run_osascript(script).await?;
515        let mut windows = Vec::new();
516
517        for line in result.lines() {
518            if line.trim().is_empty() {
519                continue;
520            }
521            let parts: Vec<&str> = line.split("|||").collect();
522            if parts.len() < 5 {
523                continue;
524            }
525            let position = parse_xy_pair(parts[2]);
526            let size = parse_wh_pair(parts[3]);
527            windows.push(WindowInfo {
528                app_name: parts[0].to_string(),
529                title: parts[1].to_string(),
530                position: position.map(|(x, y)| (x as i32, y as i32)),
531                size: size.map(|(w, h)| (w as u32, h as u32)),
532                is_minimized: parts[4].trim().eq_ignore_ascii_case("true"),
533            });
534        }
535
536        Ok(windows)
537    }
538
539    async fn find_ui_elements(
540        &self,
541        app: &str,
542        selector: &UiSelector,
543    ) -> PunchResult<Vec<UiElement>> {
544        let escaped_app = escape_applescript(app);
545
546        // Build the accessibility query. If a role is specified, query that role;
547        // otherwise query all "UI element" types.
548        let role_clause = if let Some(ref role) = selector.role {
549            validate_role_filter(role)?;
550            format!("every {role}")
551        } else {
552            "every UI element".to_string()
553        };
554
555        let script = format!(
556            r#"tell application "System Events" to tell process "{escaped_app}"
557set elements to {role_clause} of window 1
558set result to ""
559set idx to 0
560repeat with el in elements
561    set elRole to role of el
562    set elName to ""
563    try
564        set elName to name of el
565    end try
566    set elValue to ""
567    try
568        set elValue to value of el as text
569    end try
570    set elEnabled to true
571    try
572        set elEnabled to enabled of el
573    end try
574    set result to result & idx & "|||" & elRole & "|||" & elName & "|||" & elValue & "|||" & (elEnabled as text) & linefeed
575    set idx to idx + 1
576end repeat
577return result
578end tell"#
579        );
580
581        let result = self.run_osascript(&script).await.map_err(|e| {
582            PunchError::Tool {
583                tool: "ui_find_elements".into(),
584                message: format!(
585                    "No accessible elements found for {app}. This app may have limited accessibility support. Try sys_screenshot to visually inspect the window. (Error: {e})"
586                ),
587            }
588        })?;
589
590        let mut elements = Vec::new();
591        for line in result.lines() {
592            if line.trim().is_empty() {
593                continue;
594            }
595            let parts: Vec<&str> = line.split("|||").collect();
596            if parts.len() < 5 {
597                continue;
598            }
599
600            let label = if parts[2].is_empty() {
601                None
602            } else {
603                Some(parts[2].to_string())
604            };
605            let value = if parts[3].is_empty() {
606                None
607            } else {
608                Some(parts[3].to_string())
609            };
610
611            // Apply label/value filters from selector.
612            if let Some(ref filter_label) = selector.label
613                && !label
614                    .as_ref()
615                    .is_some_and(|l| l.to_lowercase().contains(&filter_label.to_lowercase()))
616            {
617                continue;
618            }
619            if let Some(ref filter_value) = selector.value
620                && !value
621                    .as_ref()
622                    .is_some_and(|v| v.to_lowercase().contains(&filter_value.to_lowercase()))
623            {
624                continue;
625            }
626
627            elements.push(UiElement {
628                element_id: format!("{}:{}:{}", app, parts[1].trim(), parts[0].trim()),
629                role: parts[1].to_string(),
630                label,
631                value,
632                enabled: parts[4].trim().eq_ignore_ascii_case("true"),
633            });
634        }
635
636        Ok(elements)
637    }
638
639    async fn click_element(&self, element_id: &str) -> PunchResult<()> {
640        let (app, role, index) = parse_element_id(element_id, "ui_click")?;
641        let escaped_app = escape_applescript(&app);
642        let applescript_index = index + 1; // AppleScript is 1-based
643
644        let script = format!(
645            r#"tell application "System Events" to tell process "{escaped_app}"
646click {role} {applescript_index} of window 1
647end tell"#
648        );
649
650        self.run_osascript(&script).await?;
651        Ok(())
652    }
653
654    async fn type_text(&self, element_id: &str, text: &str) -> PunchResult<()> {
655        let (app, role, index) = parse_element_id(element_id, "ui_type_text")?;
656        let escaped_app = escape_applescript(&app);
657        let escaped_text = escape_applescript(text);
658        let applescript_index = index + 1;
659
660        let script = format!(
661            r#"tell application "System Events" to tell process "{escaped_app}"
662set value of {role} {applescript_index} of window 1 to "{escaped_text}"
663end tell"#
664        );
665
666        self.run_osascript(&script).await?;
667        Ok(())
668    }
669
670    async fn read_element_attribute(
671        &self,
672        element_id: &str,
673        attribute: &str,
674    ) -> PunchResult<String> {
675        validate_attribute(attribute)?;
676        let (app, role, index) = parse_element_id(element_id, "ui_read_attribute")?;
677        let escaped_app = escape_applescript(&app);
678        let applescript_index = index + 1;
679
680        let script = format!(
681            r#"tell application "System Events" to tell process "{escaped_app}"
682return {attribute} of {role} {applescript_index} of window 1 as text
683end tell"#
684        );
685
686        self.run_osascript(&script).await
687    }
688}
689
690// ---------------------------------------------------------------------------
691// Linux backend (best-effort)
692// ---------------------------------------------------------------------------
693
/// Best-effort Linux automation backend.
///
/// Stateless: the implementation below shells out to `scrot`/`import`,
/// `tesseract`, and `wmctrl`, and reports the UI-interaction operations as
/// not yet implemented.
#[cfg(target_os = "linux")]
pub struct LinuxBackend;
696
697#[cfg(target_os = "linux")]
698#[async_trait]
699impl AutomationBackend for LinuxBackend {
700    async fn screenshot(&self, _window: Option<&str>) -> PunchResult<ScreenshotResult> {
701        use base64::Engine;
702
703        let path = format!("/tmp/punch_screenshot_{}.png", std::process::id());
704
705        // Try scrot first, then import (ImageMagick).
706        let output = tokio::process::Command::new("scrot")
707            .arg(&path)
708            .output()
709            .await;
710
711        let ok = match output {
712            Ok(o) if o.status.success() => true,
713            _ => {
714                let import = tokio::process::Command::new("import")
715                    .arg("-window")
716                    .arg("root")
717                    .arg(&path)
718                    .output()
719                    .await;
720                matches!(import, Ok(o) if o.status.success())
721            }
722        };
723
724        if !ok {
725            return Err(PunchError::Tool {
726                tool: "sys_screenshot".into(),
727                message:
728                    "Screenshot requires scrot or ImageMagick. Install: sudo apt install scrot"
729                        .into(),
730            });
731        }
732
733        let data = tokio::fs::read(&path).await.map_err(|e| PunchError::Tool {
734            tool: "sys_screenshot".into(),
735            message: format!("failed to read screenshot: {e}"),
736        })?;
737        let _ = tokio::fs::remove_file(&path).await;
738
739        let (width, height) = parse_png_dimensions(&data).unwrap_or((0, 0));
740        let png_base64 = base64::engine::general_purpose::STANDARD.encode(&data);
741
742        Ok(ScreenshotResult {
743            png_base64,
744            width,
745            height,
746        })
747    }
748
    /// Region/element screenshots are not supported on Linux yet: both
    /// arguments are ignored and a `PunchError::Tool` error is always returned.
    async fn ui_screenshot(
        &self,
        _element_id: Option<&str>,
        _bounds: Option<(i32, i32, u32, u32)>,
    ) -> PunchResult<ScreenshotResult> {
        Err(PunchError::Tool {
            tool: "ui_screenshot".into(),
            message: "UI region screenshot not yet implemented on Linux.".into(),
        })
    }
759
760    async fn app_ocr(&self, _app: &str) -> PunchResult<OcrResult> {
761        use base64::Engine;
762
763        // Capture full screen then OCR with tesseract.
764        let ss = self.screenshot(None).await?;
765        let tmp = format!("/tmp/punch_ocr_{}.png", std::process::id());
766        let data: Vec<u8> = base64::engine::general_purpose::STANDARD
767            .decode(&ss.png_base64)
768            .map_err(|e| PunchError::Tool {
769                tool: "app_ocr".into(),
770                message: format!("decode error: {e}"),
771            })?;
772        tokio::fs::write(&tmp, &data)
773            .await
774            .map_err(|e| PunchError::Tool {
775                tool: "app_ocr".into(),
776                message: format!("write error: {e}"),
777            })?;
778        let output = tokio::process::Command::new("tesseract")
779            .arg(&tmp)
780            .arg("stdout")
781            .output()
782            .await;
783        let _ = tokio::fs::remove_file(&tmp).await;
784        match output {
785            Ok(o) if o.status.success() => {
786                let text = String::from_utf8_lossy(&o.stdout).trim().to_string();
787                Ok(OcrResult {
788                    text: text.clone(),
789                    regions: vec![OcrRegion {
790                        text,
791                        bounds: None,
792                        confidence: 0.7,
793                    }],
794                })
795            }
796            _ => Err(PunchError::Tool {
797                tool: "app_ocr".into(),
798                message: "tesseract not found. Install: sudo apt install tesseract-ocr".into(),
799            }),
800        }
801    }
802
803    async fn list_windows(&self) -> PunchResult<Vec<WindowInfo>> {
804        let output = tokio::process::Command::new("wmctrl")
805            .arg("-l")
806            .output()
807            .await;
808        match output {
809            Ok(o) if o.status.success() => {
810                let stdout = String::from_utf8_lossy(&o.stdout);
811                let windows = stdout
812                    .lines()
813                    .filter_map(|line| {
814                        let parts: Vec<&str> = line.splitn(4, char::is_whitespace).collect();
815                        if parts.len() >= 4 {
816                            Some(WindowInfo {
817                                title: parts[3].to_string(),
818                                app_name: parts[3].to_string(),
819                                position: None,
820                                size: None,
821                                is_minimized: false,
822                            })
823                        } else {
824                            None
825                        }
826                    })
827                    .collect();
828                Ok(windows)
829            }
830            _ => Err(PunchError::Tool {
831                tool: "ui_list_windows".into(),
832                message: "wmctrl not found. Install: sudo apt install wmctrl".into(),
833            }),
834        }
835    }
836
837    async fn find_ui_elements(
838        &self,
839        _app: &str,
840        _selector: &UiSelector,
841    ) -> PunchResult<Vec<UiElement>> {
842        Err(PunchError::Tool { tool: "ui_find_elements".into(), message: "Accessibility tree query not yet implemented on Linux. Use sys_screenshot for visual inspection.".into() })
843    }
844
    /// Clicking UI elements is not supported on Linux yet: the argument is
    /// ignored and a `PunchError::Tool` error is always returned.
    async fn click_element(&self, _element_id: &str) -> PunchResult<()> {
        Err(PunchError::Tool {
            tool: "ui_click".into(),
            message: "UI click not yet implemented on Linux.".into(),
        })
    }
851
    /// Typing into UI elements is not supported on Linux yet: both arguments
    /// are ignored and a `PunchError::Tool` error is always returned.
    async fn type_text(&self, _element_id: &str, _text: &str) -> PunchResult<()> {
        Err(PunchError::Tool {
            tool: "ui_type_text".into(),
            message: "UI type not yet implemented on Linux.".into(),
        })
    }
858
    /// Reading accessibility attributes is not supported on Linux yet: both
    /// arguments are ignored and a `PunchError::Tool` error is always returned.
    async fn read_element_attribute(
        &self,
        _element_id: &str,
        _attribute: &str,
    ) -> PunchResult<String> {
        Err(PunchError::Tool {
            tool: "ui_read_attribute".into(),
            message: "Attribute reading not yet implemented on Linux.".into(),
        })
    }
869}
870
871// ---------------------------------------------------------------------------
872// Windows backend (best-effort)
873// ---------------------------------------------------------------------------
874
/// Desktop automation backend compiled only for Windows targets.
///
/// Every [`AutomationBackend`] operation is currently a placeholder: it
/// ignores its arguments and fails with a [`PunchError::Tool`] whose message
/// states that the feature is not yet implemented.
#[cfg(target_os = "windows")]
pub struct WindowsBackend;

#[cfg(target_os = "windows")]
impl WindowsBackend {
    /// Assemble the error returned by a placeholder operation of `tool`.
    fn not_implemented(tool: &'static str, message: &'static str) -> PunchError {
        PunchError::Tool {
            tool: tool.into(),
            message: message.into(),
        }
    }
}

#[cfg(target_os = "windows")]
#[async_trait]
impl AutomationBackend for WindowsBackend {
    /// Placeholder: always fails under the `sys_screenshot` tool name.
    async fn screenshot(&self, _window: Option<&str>) -> PunchResult<ScreenshotResult> {
        Err(WindowsBackend::not_implemented(
            "sys_screenshot",
            "Windows screenshot not yet implemented.",
        ))
    }

    /// Placeholder: always fails under the `ui_screenshot` tool name.
    async fn ui_screenshot(
        &self,
        _element_id: Option<&str>,
        _bounds: Option<(i32, i32, u32, u32)>,
    ) -> PunchResult<ScreenshotResult> {
        Err(WindowsBackend::not_implemented(
            "ui_screenshot",
            "Windows UI screenshot not yet implemented.",
        ))
    }

    /// Placeholder: always fails under the `app_ocr` tool name.
    async fn app_ocr(&self, _app: &str) -> PunchResult<OcrResult> {
        Err(WindowsBackend::not_implemented(
            "app_ocr",
            "Windows OCR not yet implemented.",
        ))
    }

    /// Placeholder: always fails under the `ui_list_windows` tool name.
    async fn list_windows(&self) -> PunchResult<Vec<WindowInfo>> {
        Err(WindowsBackend::not_implemented(
            "ui_list_windows",
            "Windows list_windows not yet implemented.",
        ))
    }

    /// Placeholder: always fails under the `ui_find_elements` tool name.
    async fn find_ui_elements(
        &self,
        _app: &str,
        _selector: &UiSelector,
    ) -> PunchResult<Vec<UiElement>> {
        Err(WindowsBackend::not_implemented(
            "ui_find_elements",
            "Windows UI automation not yet implemented.",
        ))
    }

    /// Placeholder: always fails under the `ui_click` tool name.
    async fn click_element(&self, _element_id: &str) -> PunchResult<()> {
        Err(WindowsBackend::not_implemented(
            "ui_click",
            "Windows UI click not yet implemented.",
        ))
    }

    /// Placeholder: always fails under the `ui_type_text` tool name.
    async fn type_text(&self, _element_id: &str, _text: &str) -> PunchResult<()> {
        Err(WindowsBackend::not_implemented(
            "ui_type_text",
            "Windows UI type not yet implemented.",
        ))
    }

    /// Placeholder: always fails under the `ui_read_attribute` tool name.
    async fn read_element_attribute(
        &self,
        _element_id: &str,
        _attribute: &str,
    ) -> PunchResult<String> {
        Err(WindowsBackend::not_implemented(
            "ui_read_attribute",
            "Windows attribute reading not yet implemented.",
        ))
    }
}
942
943// ---------------------------------------------------------------------------
944// Stub backend (unsupported platforms)
945// ---------------------------------------------------------------------------
946
/// Fallback automation backend for targets other than macOS, Linux and Windows.
///
/// Every [`AutomationBackend`] operation ignores its arguments and fails with
/// a [`PunchError::Tool`] carrying the same "not supported" message; only the
/// reported tool name differs between operations.
#[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
pub struct StubBackend;

#[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
impl StubBackend {
    /// Assemble the shared "unsupported platform" error for `tool`.
    fn unsupported(tool: &'static str) -> PunchError {
        PunchError::Tool {
            tool: tool.into(),
            message: "Desktop automation not supported on this platform.".into(),
        }
    }
}

#[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
#[async_trait]
impl AutomationBackend for StubBackend {
    /// Unsupported: always fails under the `sys_screenshot` tool name.
    async fn screenshot(&self, _window: Option<&str>) -> PunchResult<ScreenshotResult> {
        Err(StubBackend::unsupported("sys_screenshot"))
    }

    /// Unsupported: always fails under the `ui_screenshot` tool name.
    async fn ui_screenshot(
        &self,
        _element_id: Option<&str>,
        _bounds: Option<(i32, i32, u32, u32)>,
    ) -> PunchResult<ScreenshotResult> {
        Err(StubBackend::unsupported("ui_screenshot"))
    }

    /// Unsupported: always fails under the `app_ocr` tool name.
    async fn app_ocr(&self, _app: &str) -> PunchResult<OcrResult> {
        Err(StubBackend::unsupported("app_ocr"))
    }

    /// Unsupported: always fails under the `ui_list_windows` tool name.
    async fn list_windows(&self) -> PunchResult<Vec<WindowInfo>> {
        Err(StubBackend::unsupported("ui_list_windows"))
    }

    /// Unsupported: always fails under the `ui_find_elements` tool name.
    async fn find_ui_elements(
        &self,
        _app: &str,
        _selector: &UiSelector,
    ) -> PunchResult<Vec<UiElement>> {
        Err(StubBackend::unsupported("ui_find_elements"))
    }

    /// Unsupported: always fails under the `ui_click` tool name.
    async fn click_element(&self, _element_id: &str) -> PunchResult<()> {
        Err(StubBackend::unsupported("ui_click"))
    }

    /// Unsupported: always fails under the `ui_type_text` tool name.
    async fn type_text(&self, _element_id: &str, _text: &str) -> PunchResult<()> {
        Err(StubBackend::unsupported("ui_type_text"))
    }

    /// Unsupported: always fails under the `ui_read_attribute` tool name.
    async fn read_element_attribute(
        &self,
        _element_id: &str,
        _attribute: &str,
    ) -> PunchResult<String> {
        Err(StubBackend::unsupported("ui_read_attribute"))
    }
}
1014
1015// ---------------------------------------------------------------------------
1016// Helpers
1017// ---------------------------------------------------------------------------
1018
/// Extract `(width, height)` from the IHDR chunk at the start of a PNG stream.
///
/// Expected layout of the first 24 bytes:
///
/// | bytes    | content                         |
/// |----------|---------------------------------|
/// | `0..8`   | PNG file signature              |
/// | `8..12`  | chunk length (not inspected)    |
/// | `12..16` | chunk type, must be `IHDR`      |
/// | `16..20` | width, big-endian `u32`         |
/// | `20..24` | height, big-endian `u32`        |
///
/// Returns `None` when `data` is shorter than 24 bytes, does not begin with
/// the PNG signature, or its first chunk is not `IHDR`. Only the header is
/// examined; the rest of the stream is not validated.
fn parse_png_dimensions(data: &[u8]) -> Option<(u32, u32)> {
    const PNG_SIGNATURE: [u8; 8] = [137, 80, 78, 71, 13, 10, 26, 10];

    // Taking a fixed-length prefix up front makes every index below in-bounds.
    let header = data.get(..24)?;
    if header[..8] != PNG_SIGNATURE {
        return None;
    }
    // Without this check any buffer that merely carries the signature would
    // have bytes 16..24 of an unrelated chunk reported as its dimensions.
    if header[12..16] != *b"IHDR" {
        return None;
    }

    let width = u32::from_be_bytes([header[16], header[17], header[18], header[19]]);
    let height = u32::from_be_bytes([header[20], header[21], header[22], header[23]]);
    Some((width, height))
}
1034
/// Split a comma-separated coordinate string such as `"x,y"` into two signed
/// integers.
///
/// Whitespace around each component is ignored. Yields `None` unless the input
/// contains exactly one comma and both sides parse as `i64`.
#[cfg(any(target_os = "macos", test))]
fn parse_xy_pair(s: &str) -> Option<(i64, i64)> {
    let mut fields = s.split(',');
    let (first, second) = (fields.next()?, fields.next()?);
    // A third field means more than one comma: not a pair.
    if fields.next().is_some() {
        return None;
    }
    let x: i64 = first.trim().parse().ok()?;
    let y: i64 = second.trim().parse().ok()?;
    Some((x, y))
}
1047
/// Split a comma-separated size string such as `"w,h"` into two unsigned
/// integers.
///
/// Whitespace around each component is ignored. Yields `None` unless the input
/// contains exactly one comma and both sides parse as `u64`.
#[cfg(any(target_os = "macos", test))]
fn parse_wh_pair(s: &str) -> Option<(u64, u64)> {
    match s.split(',').collect::<Vec<&str>>().as_slice() {
        [width, height] => {
            let w = width.trim().parse::<u64>().ok()?;
            let h = height.trim().parse::<u64>().ok()?;
            Some((w, h))
        }
        _ => None,
    }
}
1060
/// Unit tests for the escaping/validation helpers, element-id parsing, PNG
/// header parsing and coordinate parsing defined in this module.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Security helper tests ----

    /// Quotes, newlines and backslashes are escaped; plain text is untouched.
    #[test]
    fn test_escape_applescript_basic() {
        // (raw input, expected escaped form)
        let cases = [
            (r#"hello"world"#, r#"hello\"world"#),
            ("line\nnewline", "line\\nnewline"),
            (r"back\slash", r"back\\slash"),
            ("normal text", "normal text"),
        ];
        for &(raw, escaped) in &cases {
            assert_eq!(escape_applescript(raw), escaped);
        }
    }

    /// The empty string escapes to itself.
    #[test]
    fn test_escape_applescript_empty() {
        let escaped = escape_applescript("");
        assert_eq!(escaped, "");
    }

    /// Carriage returns are escaped as well.
    #[test]
    fn test_escape_applescript_carriage_return() {
        let escaped = escape_applescript("foo\rbar");
        assert_eq!(escaped, "foo\\rbar");
    }

    /// All special characters in a single input.
    #[test]
    fn test_escape_applescript_all_special() {
        let escaped = escape_applescript("\\\"\n\r");
        assert_eq!(escaped, "\\\\\\\"\\n\\r");
    }

    /// Role filters that must be accepted.
    #[test]
    fn test_validate_role_filter_valid() {
        let accepted = ["button", "text field", "UI element", "menu_item", "AXButton"];
        for &role in &accepted {
            assert!(
                validate_role_filter(role).is_ok(),
                "role filter {:?} should be accepted",
                role
            );
        }
    }

    /// Role filters carrying injection-style characters must be rejected.
    #[test]
    fn test_validate_role_filter_invalid() {
        let rejected = ["button;rm -rf", "foo\"bar", "test\ninjection", "$(whoami)"];
        for &role in &rejected {
            assert!(
                validate_role_filter(role).is_err(),
                "role filter {:?} should be rejected",
                role
            );
        }
    }

    /// Attribute names that must be accepted.
    #[test]
    fn test_validate_attribute_valid() {
        let accepted = ["value", "name", "role description", "placeholder value"];
        for &attribute in &accepted {
            assert!(
                validate_attribute(attribute).is_ok(),
                "attribute {:?} should be accepted",
                attribute
            );
        }
    }

    /// Unknown, empty or injection-style attribute names must be rejected.
    #[test]
    fn test_validate_attribute_invalid() {
        let rejected = ["hacked", "", "value; rm -rf /"];
        for &attribute in &rejected {
            assert!(
                validate_attribute(attribute).is_err(),
                "attribute {:?} should be rejected",
                attribute
            );
        }
    }

    // ---- Element ID parsing tests ----

    /// A well-formed `app:role:index` id splits into its three parts.
    #[test]
    fn test_parse_element_id_valid() {
        let parsed = parse_element_id("Safari:button:3", "test").unwrap();
        assert_eq!(parsed.0, "Safari");
        assert_eq!(parsed.1, "button");
        assert_eq!(parsed.2, 3);
    }

    /// Index zero and a role containing a space are both accepted.
    #[test]
    fn test_parse_element_id_zero_index() {
        let parsed = parse_element_id("Messages:UI element:0", "test").unwrap();
        assert_eq!(parsed.0, "Messages");
        assert_eq!(parsed.1, "UI element");
        assert_eq!(parsed.2, 0);
    }

    /// Application names may contain spaces.
    #[test]
    fn test_parse_element_id_app_with_spaces() {
        let parsed = parse_element_id("System Preferences:text field:5", "test").unwrap();
        assert_eq!(parsed.0, "System Preferences");
        assert_eq!(parsed.1, "text field");
        assert_eq!(parsed.2, 5);
    }

    /// An id without any separator is an error.
    #[test]
    fn test_parse_element_id_missing_colon() {
        let outcome = parse_element_id("Safari3", "test");
        assert!(outcome.is_err());
    }

    /// An empty application component is an error.
    #[test]
    fn test_parse_element_id_empty_app() {
        let outcome = parse_element_id(":button:3", "test");
        assert!(outcome.is_err());
    }

    /// A non-numeric index component is an error.
    #[test]
    fn test_parse_element_id_non_numeric_index() {
        let outcome = parse_element_id("Safari:button:abc", "test");
        assert!(outcome.is_err());
    }

    /// The empty string is an error.
    #[test]
    fn test_parse_element_id_empty_string() {
        let outcome = parse_element_id("", "test");
        assert!(outcome.is_err());
    }

    /// An empty role component is an error.
    #[test]
    fn test_parse_element_id_empty_role() {
        let outcome = parse_element_id("Safari::3", "test");
        assert!(outcome.is_err());
    }

    /// The application name is the first component of the id.
    #[test]
    fn test_extract_app_from_element_id() {
        assert_eq!(
            extract_app_from_element_id("Messages:UI element:0", "test").unwrap(),
            "Messages"
        );
    }

    // ---- PNG dimension parsing tests ----

    /// A minimal signature + IHDR prefix yields the encoded dimensions.
    #[test]
    fn test_parse_png_dimensions_valid() {
        let width = 1920u32.to_be_bytes();
        let height = 1080u32.to_be_bytes();
        let sections: [&[u8]; 5] = [
            &[137, 80, 78, 71, 13, 10, 26, 10], // PNG signature
            &[0, 0, 0, 13],                     // IHDR length
            b"IHDR",                            // chunk type
            &width,
            &height,
        ];
        let data = sections.concat();

        assert_eq!(parse_png_dimensions(&data), Some((1920, 1080)));
    }

    /// Fewer than 24 bytes cannot hold the header.
    #[test]
    fn test_parse_png_dimensions_too_short() {
        assert_eq!(parse_png_dimensions(&[0; 10]), None);
    }

    /// Enough bytes, but not starting with the PNG signature.
    #[test]
    fn test_parse_png_dimensions_bad_signature() {
        assert_eq!(parse_png_dimensions(&[0; 30]), None);
    }

    // ---- Coordinate parsing tests ----

    /// Signed pairs parse; malformed or single-field input does not.
    #[test]
    fn test_parse_xy_pair() {
        let parsed = [("100,200", (100, 200)), ("-10, 50", (-10, 50))];
        for &(text, pair) in &parsed {
            assert_eq!(parse_xy_pair(text), Some(pair));
        }
        for &text in &["abc,def", "100"] {
            assert_eq!(parse_xy_pair(text), None, "{:?} should not parse", text);
        }
    }

    /// Unsigned pairs parse; malformed or single-field input does not.
    #[test]
    fn test_parse_wh_pair() {
        assert_eq!(parse_wh_pair("1920,1080"), Some((1920, 1080)));
        for &text in &["abc,100", "100"] {
            assert_eq!(parse_wh_pair(text), None, "{:?} should not parse", text);
        }
    }
}