car-desktop 0.6.0

OS-level screen capture, accessibility inspection, and input synthesis for Common Agent Runtime
Documentation
//! macOS screen / window capture.
//!
//! Implementation choice: we use the classic Core Graphics capture
//! APIs (`CGDisplayCreateImage` / `CGWindowListCreateImage`) rather
//! than ScreenCaptureKit for the v1 deliverable. Rationale:
//!
//! * **Simplicity.** CG returns a `CGImage` synchronously. SCK is a
//!   delegate-based streaming API with async callbacks; for a
//!   one-shot observation the CG path is ~20 lines against SCK's
//!   ~200.
//! * **Coverage.** CG works on macOS 10.5+. SCK requires 12.3+.
//!   The Tokhn team targets macOS 14+ today but there's no upside
//!   to raising the floor unless SCK buys us something.
//! * **Deprecation status.** `CGDisplayCreateImage` and
//!   `CGWindowListCreateImage` are deprecation-flagged in macOS
//!   14 headers but remain fully functional through the current
//!   release. Apple has not announced a removal version. When they
//!   do, the swap to SCK is a single module (this one) replacing
//!   two functions; the crate's public surface is unaffected.
//!
//! Same Screen Recording TCC permission is required either way
//! (the OS gates the framebuffer access, not the specific API
//! surface).

use chrono::Utc;
use core_foundation::base::{CFType, TCFType};
use core_foundation::dictionary::CFDictionary;
use core_foundation::number::CFNumber;
use core_foundation::string::CFString;
use core_graphics::display::CGDisplay;
use core_graphics::geometry::{CGPoint, CGRect, CGSize};
use core_graphics::image::CGImageRef;
use core_graphics::window::{
    copy_window_info, create_image, kCGWindowImageBoundsIgnoreFraming,
    kCGWindowListOptionIncludingWindow, kCGWindowListOptionOnScreenOnly, kCGWindowNumber,
};

use crate::errors::{CarDesktopError, Result};
use crate::models::{DisplayId, Frame, WindowHandle};

/// Grab the current contents of one display as a [`Frame`] of tight
/// top-left-origin RGBA8 pixels.
///
/// Passing `DisplayId::PRIMARY` selects `CGDisplay::main()`; every
/// other id is cast to `u32` and handed to `CGDisplay::new` as a
/// `CGDirectDisplayID`. The captured `CGImage` is converted by
/// `frame_from_cgimage`.
///
/// # Errors
///
/// Returns `CarDesktopError::OsApi` when the display yields no
/// image, and forwards any error from `frame_from_cgimage`.
pub fn capture_display_impl(display: DisplayId) -> Result<Frame> {
    let is_primary = display == DisplayId::PRIMARY;
    let target = if is_primary {
        CGDisplay::main()
    } else {
        CGDisplay::new(display.0 as u32)
    };

    match target.image() {
        Some(captured) => frame_from_cgimage(&captured),
        None => Err(CarDesktopError::OsApi {
            detail: format!(
                "CGDisplayCreateImage returned null for display {}",
                display.0
            ),
            source: None,
        }),
    }
}

/// Capture a specific window and return it as RGBA8. The window's
/// frame is resolved via a fresh CGWindowList query so we pass
/// concrete bounds to `CGWindowListCreateImage` — this avoids the
/// CGRectNull sentinel construction and makes the capture width
/// predictable across macOS versions.
pub fn capture_window_impl(handle: WindowHandle) -> Result<Frame> {
    let frame = lookup_window_frame(handle)?;
    let rect = CGRect::new(
        &CGPoint::new(frame.x, frame.y),
        &CGSize::new(frame.width.max(1.0), frame.height.max(1.0)),
    );
    let image = create_image(
        rect,
        kCGWindowListOptionIncludingWindow,
        handle.window_id as u32,
        kCGWindowImageBoundsIgnoreFraming,
    )
    .ok_or_else(|| CarDesktopError::OsApi {
        detail: format!(
            "CGWindowListCreateImage returned null for window {}:{}",
            handle.pid, handle.window_id
        ),
        source: None,
    })?;
    frame_from_cgimage(&image)
}

/// Resolve a window's current frame via `CGWindowListCopyWindowInfo`.
///
/// Used by `capture_window_impl` so it can pass concrete bounds to the
/// capture API; also intended for the input-synthesis layer (CD-05)
/// to clamp click points. The result is a small private
/// [`WindowFrame`] struct, which keeps the dependency surface tight.
///
/// Matching is done on `handle.window_id` only; `handle.pid` appears
/// solely in error messages.
///
/// # Errors
///
/// * `CarDesktopError::OsApi` if the window list query returns null.
/// * `CarDesktopError::WindowNotFound` if no listed window carries the
///   requested id, or the matching entry has no readable
///   `kCGWindowBounds`.
fn lookup_window_frame(handle: WindowHandle) -> Result<WindowFrame> {
    // The query takes a 32-bit window id. An id that does not fit is
    // truncated here, but the full-width comparison in the loop below
    // then rejects every entry, so the caller gets `WindowNotFound`
    // rather than a frame for some unrelated window.
    let list = copy_window_info(
        kCGWindowListOptionOnScreenOnly | kCGWindowListOptionIncludingWindow,
        handle.window_id as u32,
    )
    .ok_or_else(|| CarDesktopError::OsApi {
        detail: "CGWindowListCopyWindowInfo returned null in lookup_window_frame".into(),
        source: None,
    })?;

    // Built once: the key does not change between list entries.
    // SAFETY: `kCGWindowNumber` is a `CFStringRef` constant exported by
    // Core Graphics; wrapping under the get rule retains it instead of
    // taking over a reference we do not own.
    let number_key = unsafe { CFString::wrap_under_get_rule(kCGWindowNumber) };

    let count = list.len();
    for i in 0..count {
        let Some(dict_ref) = list.get(i) else { continue };
        let dict_type_ref = *dict_ref as core_foundation::base::CFTypeRef;
        // SAFETY: per the CGWindowListCopyWindowInfo documentation each
        // array element is a CFDictionary owned by the array; the get
        // rule retains it for the lifetime of `dict`.
        let dict: CFDictionary<CFString, CFType> =
            unsafe { CFDictionary::wrap_under_get_rule(dict_type_ref as _) };

        let Some(value) = dict.find(&number_key) else { continue };
        let Some(number) = value.downcast::<CFNumber>() else {
            continue;
        };
        // Skip entries whose id cannot be read instead of substituting
        // a sentinel: a `-1` placeholder would become `u64::MAX` after
        // the cast and could spuriously equal `handle.window_id`.
        let Some(id) = number.to_i64() else { continue };
        if id < 0 || id as u64 != handle.window_id {
            continue;
        }

        return extract_bounds(&dict).ok_or_else(|| CarDesktopError::WindowNotFound {
            detail: format!(
                "window {}:{} missing kCGWindowBounds",
                handle.pid, handle.window_id
            ),
        });
    }
    Err(CarDesktopError::WindowNotFound {
        detail: format!("window {}:{} no longer on-screen", handle.pid, handle.window_id),
    })
}

/// A window's bounding rectangle as read from its `kCGWindowBounds`
/// dictionary by `extract_bounds`.
///
/// `capture_window_impl` feeds these values straight into the
/// `CGRect` it passes to the capture call.
///
/// NOTE(review): presumably global display coordinates with a
/// top-left origin, in points rather than pixels — confirm against
/// the CGWindowList documentation.
#[derive(Debug, Clone, Copy)]
struct WindowFrame {
    /// Value of the `"X"` entry of the bounds dictionary.
    x: f64,
    /// Value of the `"Y"` entry of the bounds dictionary.
    y: f64,
    /// Value of the `"Width"` entry of the bounds dictionary.
    width: f64,
    /// Value of the `"Height"` entry of the bounds dictionary.
    height: f64,
}

/// Read the `kCGWindowBounds` entry of a window-info dictionary into
/// a `WindowFrame`.
///
/// Returns `None` when the entry is absent, is not itself a
/// dictionary, or when any of its `"X"`, `"Y"`, `"Width"` or
/// `"Height"` members is missing or cannot be read as a number.
fn extract_bounds(dict: &CFDictionary<CFString, CFType>) -> Option<WindowFrame> {
    // SAFETY: `kCGWindowBounds` is a `CFStringRef` constant exported by
    // Core Graphics; the get rule retains it rather than assuming
    // ownership of the existing reference.
    let bounds_key =
        unsafe { CFString::wrap_under_get_rule(core_graphics::window::kCGWindowBounds) };

    let raw: CFDictionary = dict.find(&bounds_key)?.downcast::<CFDictionary>()?;

    // Re-wrap the untyped dictionary so lookups can use `CFString`
    // keys and yield `CFType` values.
    // SAFETY: `raw` is a live CFDictionary for the duration of this
    // call, and the get rule takes an additional retain for `rect`.
    let rect: CFDictionary<CFString, CFType> =
        unsafe { CFDictionary::wrap_under_get_rule(raw.as_concrete_TypeRef()) };

    // Fetch one numeric member, preferring the floating-point reading
    // and falling back to an integer one.
    let field = |name: &str| -> Option<f64> {
        let lookup = CFString::new(name);
        let number = rect.find(&lookup)?.downcast::<CFNumber>()?;
        match number.to_f64() {
            Some(value) => Some(value),
            None => number.to_i64().map(|whole| whole as f64),
        }
    };

    let x = field("X")?;
    let y = field("Y")?;
    let width = field("Width")?;
    let height = field("Height")?;
    Some(WindowFrame {
        x,
        y,
        width,
        height,
    })
}

/// Convert a `CGImage` into a `Frame` with tight top-left-origin
/// RGBA8 bytes. Handles the BGRA → RGBA byte reorder that macOS
/// framebuffers default to, and strips any per-row padding the
/// bitmap context may have added for 16-byte alignment.
///
/// The image's backing bytes are read in place; the only allocation
/// is the output RGBA buffer.
///
/// # Errors
///
/// Returns `CarDesktopError::OsApi` when:
///
/// * the image is not 32 bits per pixel,
/// * `bytes_per_row` is smaller than `width * 4`,
/// * the backing data is shorter than `bytes_per_row * height`, or
/// * `Frame::validate` rejects the assembled frame.
fn frame_from_cgimage(image: &CGImageRef) -> Result<Frame> {
    let width = image.width() as u32;
    let height = image.height() as u32;
    let bits_per_pixel = image.bits_per_pixel();
    if bits_per_pixel != 32 {
        return Err(CarDesktopError::OsApi {
            detail: format!(
                "unexpected CGImage bits_per_pixel = {bits_per_pixel} (expected 32)"
            ),
            source: None,
        });
    }

    let bytes_per_row = image.bytes_per_row() as usize;
    let stride = width as usize * 4;
    let rows = height as usize;
    // A row pitch narrower than the visible pixels would make the
    // per-row slice below run past the end of the buffer on the last
    // row; report it as an OS API error instead of panicking.
    if bytes_per_row < stride {
        return Err(CarDesktopError::OsApi {
            detail: format!(
                "CGImage bytes_per_row {bytes_per_row} less than width*4 {stride}"
            ),
            source: None,
        });
    }

    // Borrow the image's bytes rather than copying them: the data is
    // only read once while being repacked into `rgba`.
    let data = image.data();
    let native = data.bytes();
    let expected_len = bytes_per_row * rows;
    if native.len() < expected_len {
        return Err(CarDesktopError::OsApi {
            detail: format!(
                "CGImage data length {} less than bytes_per_row*height {}",
                native.len(),
                expected_len,
            ),
            source: None,
        });
    }

    let rgba = bgra_rows_to_rgba(native, stride, bytes_per_row, rows);

    // Scale factor: CGDisplayCreateImage returns pixel dimensions
    // native to the display, so scale_factor = 1.0 from the image's
    // perspective. UI code that needs Retina awareness compares the
    // frame's width to the window's logical bounds.
    let frame = Frame {
        width,
        height,
        scale_factor: 1.0,
        rgba,
        captured_at: Utc::now(),
    };
    frame.validate().map_err(|detail| CarDesktopError::OsApi {
        detail,
        source: None,
    })?;
    Ok(frame)
}

/// Repack `rows` rows of BGRA pixels, spaced `bytes_per_row` apart,
/// into one tight RGBA buffer of `stride` bytes per row.
///
/// Callers must guarantee `stride <= bytes_per_row` and
/// `native.len() >= bytes_per_row * rows`; otherwise the row slice
/// panics.
fn bgra_rows_to_rgba(
    native: &[u8],
    stride: usize,
    bytes_per_row: usize,
    rows: usize,
) -> Vec<u8> {
    let mut rgba = Vec::with_capacity(stride * rows);
    for row in 0..rows {
        let start = row * bytes_per_row;
        // Only the first `stride` bytes of each row are pixels; the
        // remainder, if any, is alignment padding and is dropped.
        let line = &native[start..start + stride];
        // Native macOS screen captures default to
        // kCGBitmapByteOrder32Little + kCGImageAlphaPremultipliedFirst,
        // which places the four bytes as (B, G, R, A) in memory.
        // Swap to RGBA.
        for px in line.chunks_exact(4) {
            rgba.extend_from_slice(&[px[2], px[1], px[0], px[3]]);
        }
    }
    rgba
}

#[cfg(test)]
mod tests {
    #[test]
    fn scale_factor_is_one_for_native_pixel_frames() {
        // Sanity: frame_from_cgimage sets scale_factor = 1.0 since
        // the CGImage is in native pixels. Regression guard against
        // someone "helpfully" dividing by 2 for Retina here and
        // breaking downstream pixel-accurate tooling.
        //
        // NOTE(review): the assertion below compares two literals and
        // never calls `frame_from_cgimage`, so it passes regardless of
        // what that function does. Treat it as a placeholder until a
        // test can feed a real or synthetic CGImage through the
        // conversion and inspect the resulting `Frame`.
        assert!((1.0f32 - 1.0f32).abs() < 1e-6);
    }
}