car-desktop 0.15.2

OS-level screen capture, accessibility inspection, and input synthesis for Common Agent Runtime
Documentation
//! macOS screen / window capture.
//!
//! Backend strategy:
//!
//! * **Display capture** prefers `ScreenCaptureKit` (macOS 14+) and
//!   falls back to `CGDisplayCreateImage` when SCK isn't available
//!   at runtime (macOS 13 hosts, missing Swift toolchain at build
//!   time, or SCK explicitly returning unavailable). SCK is the
//!   forward-supported path; CG is being deprecated by Apple, no
//!   removal version announced yet but we want to be off it.
//! * **Window capture** still goes through `CGWindowListCreateImage`
//!   today. SCK's per-window capture (`SCContentFilter(desktopIndependentWindow:)`)
//!   only matches windows the calling process can observe through
//!   `SCShareableContent.current` — the right behavior for the
//!   privacy edge case where CG can sometimes leak protected
//!   windows, but the resolution path from a `WindowHandle` to an
//!   `SCWindow` is non-trivial and is tracked as future work.
//!
//! The same Screen Recording TCC permission is required either way
//! (the OS gates framebuffer access itself, not the specific API
//! surface used to reach it).

use chrono::Utc;
use core_foundation::base::{CFType, TCFType};
use core_foundation::dictionary::CFDictionary;
use core_foundation::number::CFNumber;
use core_foundation::string::CFString;
use core_graphics::display::CGDisplay;
use core_graphics::geometry::{CGPoint, CGRect, CGSize};
use core_graphics::image::CGImageRef;
use core_graphics::window::{
    copy_window_info, create_image, kCGWindowImageBoundsIgnoreFraming,
    kCGWindowListOptionIncludingWindow, kCGWindowListOptionOnScreenOnly, kCGWindowNumber,
};

use crate::errors::{CarDesktopError, Result};
use crate::models::{DisplayId, Frame, WindowHandle};

/// Grab the full framebuffer of one display as tightly packed,
/// top-left-origin RGBA8.
///
/// `DisplayId::PRIMARY` (0) selects the primary display; every other
/// value is forwarded unchanged as a `CGDirectDisplayID`.
///
/// The ScreenCaptureKit backend is asked first. When it reports
/// itself unavailable (`Ok(None)`), the CoreGraphics path is used
/// instead.
///
/// # Errors
///
/// An error from the SCK attempt is propagated as-is and does not
/// trigger the fallback; otherwise any failure of the CoreGraphics
/// path is returned.
pub fn capture_display_impl(display: DisplayId) -> Result<Frame> {
    match sck::try_capture_display(display)? {
        Some(frame) => Ok(frame),
        None => capture_display_via_cg(display),
    }
}

/// CoreGraphics display capture, used when ScreenCaptureKit is not
/// available.
///
/// `DisplayId::PRIMARY` maps to `CGDisplay::main()`; any other id is
/// handed to `CGDisplay::new` after narrowing to `u32`.
///
/// # Errors
///
/// Returns `CarDesktopError::OsApi` when the display yields no image,
/// plus whatever `frame_from_cgimage` reports for the captured image.
fn capture_display_via_cg(display: DisplayId) -> Result<Frame> {
    let target = if display != DisplayId::PRIMARY {
        CGDisplay::new(display.0 as u32)
    } else {
        CGDisplay::main()
    };

    match target.image() {
        Some(snapshot) => frame_from_cgimage(&snapshot),
        None => Err(CarDesktopError::OsApi {
            detail: format!(
                "CGDisplayCreateImage returned null for display {}",
                display.0
            ),
            source: None,
        }),
    }
}

/// Capture a specific window and return it as RGBA8. The window's
/// frame is resolved via a fresh CGWindowList query so we pass
/// concrete bounds to `CGWindowListCreateImage` — this avoids the
/// CGRectNull sentinel construction and makes the capture width
/// predictable across macOS versions.
pub fn capture_window_impl(handle: WindowHandle) -> Result<Frame> {
    let frame = lookup_window_frame(handle)?;
    let rect = CGRect::new(
        &CGPoint::new(frame.x, frame.y),
        &CGSize::new(frame.width.max(1.0), frame.height.max(1.0)),
    );
    let image = create_image(
        rect,
        kCGWindowListOptionIncludingWindow,
        handle.window_id as u32,
        kCGWindowImageBoundsIgnoreFraming,
    )
    .ok_or_else(|| CarDesktopError::OsApi {
        detail: format!(
            "CGWindowListCreateImage returned null for window {}:{}",
            handle.pid, handle.window_id
        ),
        source: None,
    })?;
    frame_from_cgimage(&image)
}

/// Extract a window's current frame via CGWindowList. Used by
/// `capture_window_impl` so we pass concrete bounds to the capture
/// API; also used by the input-synthesis layer (CD-05) to clamp
/// click points. Returning a plain `(x, y, w, h)` tuple keeps the
/// dependency surface tight.
fn lookup_window_frame(handle: WindowHandle) -> Result<WindowFrame> {
    let list = copy_window_info(
        kCGWindowListOptionOnScreenOnly | kCGWindowListOptionIncludingWindow,
        handle.window_id as u32,
    )
    .ok_or_else(|| CarDesktopError::OsApi {
        detail: "CGWindowListCopyWindowInfo returned null in lookup_window_frame".into(),
        source: None,
    })?;
    let count = list.len();
    for i in 0..count {
        let Some(dict_ref) = list.get(i) else {
            continue;
        };
        let dict_type_ref = *dict_ref as core_foundation::base::CFTypeRef;
        let dict: CFDictionary<CFString, CFType> =
            unsafe { CFDictionary::wrap_under_get_rule(dict_type_ref as _) };
        let id = {
            let key = unsafe { CFString::wrap_under_get_rule(kCGWindowNumber) };
            let Some(v) = dict.find(&key) else { continue };
            let Some(n): Option<CFNumber> = v.downcast::<CFNumber>() else {
                continue;
            };
            n.to_i64().unwrap_or(-1)
        };
        if id as u64 != handle.window_id {
            continue;
        }
        return extract_bounds(&dict).ok_or_else(|| CarDesktopError::WindowNotFound {
            detail: format!(
                "window {}:{} missing kCGWindowBounds",
                handle.pid, handle.window_id
            ),
        });
    }
    Err(CarDesktopError::WindowNotFound {
        detail: format!(
            "window {}:{} no longer on-screen",
            handle.pid, handle.window_id
        ),
    })
}

/// A window's bounds as read from the `X` / `Y` / `Width` / `Height`
/// entries of its `kCGWindowBounds` dictionary.
///
/// NOTE(review): units and origin are whatever CGWindowList reports
/// (presumably screen-space points with a top-left origin) — confirm
/// against the CoreGraphics documentation before relying on it.
#[derive(Debug, Clone, Copy)]
struct WindowFrame {
    /// Horizontal position of the window's origin (`X` entry).
    x: f64,
    /// Vertical position of the window's origin (`Y` entry).
    y: f64,
    /// Window width (`Width` entry).
    width: f64,
    /// Window height (`Height` entry).
    height: f64,
}

/// Read the `kCGWindowBounds` entry of a window-info dictionary and
/// turn it into a `WindowFrame`.
///
/// Returns `None` when the bounds entry is absent or is not a
/// dictionary, or when any of its `X` / `Y` / `Width` / `Height`
/// values is missing or is not a number.
fn extract_bounds(dict: &CFDictionary<CFString, CFType>) -> Option<WindowFrame> {
    // SAFETY: `kCGWindowBounds` is a CFString constant exported by
    // CoreGraphics; the get rule retains it for the wrapper's lifetime.
    let bounds_key =
        unsafe { CFString::wrap_under_get_rule(core_graphics::window::kCGWindowBounds) };
    let raw_bounds: CFDictionary = dict.find(&bounds_key)?.downcast::<CFDictionary>()?;
    // Re-wrap the untyped dictionary with typed keys/values so the
    // lookups below can use `CFString` keys directly.
    // SAFETY: `raw_bounds` is a live CFDictionary; the get rule takes
    // an additional retain for the typed wrapper.
    let bounds: CFDictionary<CFString, CFType> =
        unsafe { CFDictionary::wrap_under_get_rule(raw_bounds.as_concrete_TypeRef()) };

    // Fetch one numeric component, preferring the floating-point
    // reading and falling back to an integer one.
    let component = |name: &str| -> Option<f64> {
        let name_key = CFString::new(name);
        let number: CFNumber = bounds.find(&name_key)?.downcast::<CFNumber>()?;
        match number.to_f64() {
            Some(value) => Some(value),
            None => number.to_i64().map(|whole| whole as f64),
        }
    };

    let x = component("X")?;
    let y = component("Y")?;
    let width = component("Width")?;
    let height = component("Height")?;
    Some(WindowFrame {
        x,
        y,
        width,
        height,
    })
}

/// Convert a `CGImage` into a `Frame` with tight top-left-origin
/// RGBA8 bytes. Thin wrapper over `frame_from_cgimage_bytes`.
///
/// # Errors
///
/// Returns `CarDesktopError::OsApi` when the image is not 32 bits per
/// pixel, plus anything `frame_from_cgimage_bytes` reports.
fn frame_from_cgimage(image: &CGImageRef) -> Result<Frame> {
    let bits_per_pixel = image.bits_per_pixel();
    if bits_per_pixel != 32 {
        return Err(CarDesktopError::OsApi {
            detail: format!("unexpected CGImage bits_per_pixel = {bits_per_pixel} (expected 32)"),
            source: None,
        });
    }

    let width = image.width() as u32;
    let height = image.height() as u32;
    let bytes_per_row = image.bytes_per_row() as usize;

    // Borrow the image's backing bytes instead of cloning them into a
    // temporary Vec: the converter only reads the slice and builds its
    // own output buffer, so the extra full-frame copy was pure
    // overhead.
    let data = image.data();
    let pixels: &[u8] = &data;
    frame_from_cgimage_bytes(pixels, width, height, bytes_per_row)
}

/// Convert a raw BGRA framebuffer into a `Frame` with tight
/// top-left-origin RGBA8 bytes. Used by both the CG path (where
/// the bytes come from `CGImageRef::data()`) and the SCK path
/// (where the Swift shim hands us a malloc'd buffer it rendered
/// via CGContext). Handles the BGRA → RGBA byte reorder that macOS
/// framebuffers default to, and strips any per-row padding the
/// source context may have added for 16-byte alignment.
///
/// * `native` — source bytes, at least `bytes_per_row * height` long.
/// * `width` / `height` — image size in pixels.
/// * `bytes_per_row` — source row pitch in bytes; must be at least
///   `width * 4`.
///
/// # Errors
///
/// Returns `CarDesktopError::OsApi` when `bytes_per_row` is smaller
/// than one pixel row, when `bytes_per_row * height` overflows, when
/// `native` is too short, or when `Frame::validate` rejects the result.
fn frame_from_cgimage_bytes(
    native: &[u8],
    width: u32,
    height: u32,
    bytes_per_row: usize,
) -> Result<Frame> {
    let stride = width as usize * 4;
    let rows = height as usize;

    // A row pitch shorter than one pixel row would let the per-row
    // slice below run past the end of `native` (the total-length check
    // alone does not catch that), so reject it up front.
    if bytes_per_row < stride {
        return Err(CarDesktopError::OsApi {
            detail: format!("bytes_per_row {bytes_per_row} less than width*4 {stride}"),
            source: None,
        });
    }
    let expected_len = bytes_per_row
        .checked_mul(rows)
        .ok_or_else(|| CarDesktopError::OsApi {
            detail: format!("bytes_per_row {bytes_per_row} * height {height} overflows usize"),
            source: None,
        })?;
    if native.len() < expected_len {
        return Err(CarDesktopError::OsApi {
            detail: format!(
                "framebuffer length {} less than bytes_per_row*height {}",
                native.len(),
                expected_len,
            ),
            source: None,
        });
    }

    // `stride <= bytes_per_row`, so this product cannot overflow once
    // `expected_len` has been computed.
    let mut rgba = Vec::with_capacity(stride * rows);
    for row in 0..rows {
        let src_start = row * bytes_per_row;
        let src_end = src_start + stride;
        // Native macOS screen captures default to
        // kCGBitmapByteOrder32Little + kCGImageAlphaPremultipliedFirst,
        // which places the four bytes as (B, G, R, A) in memory.
        // Copy the row (dropping any trailing padding), then swap
        // byte 0 and byte 2 of every pixel in place. The copy-then-swap
        // shape is friendly to autovectorization, which matters on a
        // 4K display (~8.3M pixels, ~33 MB per frame).
        let row_start = rgba.len();
        rgba.extend_from_slice(&native[src_start..src_end]);
        for px in rgba[row_start..].chunks_exact_mut(4) {
            px.swap(0, 2);
        }
    }

    // Scale factor: capture APIs return pixel dimensions native to
    // the display, so scale_factor = 1.0 from the image's
    // perspective. UI code that needs Retina awareness compares the
    // frame's width to the window's logical bounds.
    let frame = Frame {
        width,
        height,
        scale_factor: 1.0,
        rgba,
        captured_at: Utc::now(),
    };
    frame.validate().map_err(|detail| CarDesktopError::OsApi {
        detail,
        source: None,
    })?;
    Ok(frame)
}

#[cfg(test)]
mod tests {
    /// Documents the invariant that frames built from native-pixel
    /// captures carry `scale_factor = 1.0`, so nobody "helpfully"
    /// halves it for Retina and breaks pixel-accurate tooling.
    ///
    /// NOTE(review): the assertion below compares two constants and
    /// never calls the conversion code, so it cannot catch a
    /// regression on its own. TODO: drive `frame_from_cgimage_bytes`
    /// with a tiny buffer and assert on the returned frame instead.
    #[test]
    fn scale_factor_is_one_for_native_pixel_frames() {
        let expected_scale = 1.0f32;
        let observed_scale = 1.0f32;
        assert!((expected_scale - observed_scale).abs() < 1e-6);
    }
}

/// ScreenCaptureKit backend for full-display capture. Returns
/// `Ok(None)` when SCK is unavailable (build-time or runtime); the
/// caller falls back to the CG path. Errors only when SCK was
/// invoked and explicitly failed (TCC denied, no displays, etc.) —
/// surfacing those as errors keeps the SCK-vs-CG fallback from
/// silently masking permission problems.
mod sck {
    use super::{frame_from_cgimage_bytes, Frame};
    use crate::errors::{CarDesktopError, Result};
    use crate::models::DisplayId;

    pub fn try_capture_display(display: DisplayId) -> Result<Option<Frame>> {
        #[cfg(all(target_os = "macos", car_sck_swift_built))]
        {
            // Runtime gate: the shim's `car_sck_is_available` returns
            // 0 on macOS 13 (where SCK isn't usable) so we don't
            // attempt the call. 1 means the shim was compiled in AND
            // the running OS supports SCK.
            unsafe {
                if ffi::car_sck_is_available() == 0 {
                    return Ok(None);
                }
            }
            capture_via_sck(display).map(Some)
        }
        #[cfg(not(all(target_os = "macos", car_sck_swift_built)))]
        {
            let _ = display;
            Ok(None)
        }
    }

    #[cfg(all(target_os = "macos", car_sck_swift_built))]
    fn capture_via_sck(display: DisplayId) -> Result<Frame> {
        use std::ffi::CStr;
        use std::ptr;

        // Translate DisplayId::PRIMARY (0) → 0 sentinel that the
        // shim resolves to `content.displays.first`. Any other
        // value is the CGDirectDisplayID, which SCK matches on.
        let display_id = display.0 as u32;

        let mut buffer: *mut u8 = ptr::null_mut();
        let mut len: usize = 0;
        let mut width: u32 = 0;
        let mut height: u32 = 0;
        let mut bytes_per_row: usize = 0;
        let mut err_msg: *mut std::os::raw::c_char = ptr::null_mut();

        let rc = unsafe {
            ffi::car_sck_capture_display(
                display_id,
                &mut buffer as *mut _,
                &mut len as *mut _,
                &mut width as *mut _,
                &mut height as *mut _,
                &mut bytes_per_row as *mut _,
                &mut err_msg as *mut _,
            )
        };

        if rc != 0 {
            let detail = if err_msg.is_null() {
                format!("SCK capture failed (rc={rc}, no error string)")
            } else {
                let s = unsafe { CStr::from_ptr(err_msg).to_string_lossy().into_owned() };
                unsafe { ffi::car_sck_free_string(err_msg) };
                format!("SCK capture failed (rc={rc}): {s}")
            };
            return Err(CarDesktopError::OsApi {
                detail,
                source: None,
            });
        }

        if buffer.is_null() {
            return Err(CarDesktopError::OsApi {
                detail: "SCK returned rc=0 but null buffer".into(),
                source: None,
            });
        }

        // Copy into a Vec, then free the malloc'd buffer. Safer
        // than a borrowed slice that depends on remembering to call
        // car_sck_free_buffer.
        let native = unsafe { std::slice::from_raw_parts(buffer, len) }.to_vec();
        unsafe { ffi::car_sck_free_buffer(buffer, len) };

        frame_from_cgimage_bytes(&native, width, height, bytes_per_row)
    }

    #[cfg(all(target_os = "macos", car_sck_swift_built))]
    mod ffi {
        use std::os::raw::c_char;
        extern "C" {
            pub fn car_sck_is_available() -> i32;
            pub fn car_sck_free_string(ptr: *mut c_char);
            pub fn car_sck_free_buffer(ptr: *mut u8, len: usize);
            pub fn car_sck_capture_display(
                display_id: u32,
                out_buffer: *mut *mut u8,
                out_len: *mut usize,
                out_width: *mut u32,
                out_height: *mut u32,
                out_bytes_per_row: *mut usize,
                out_err: *mut *mut c_char,
            ) -> i32;
        }
    }
}