car-browser 0.32.0

//! Vision-augmented perception: accessibility tree + OCR fusion.
//!
//! The AX-only pipeline ([`super::pipeline::BasicPerceptionPipeline`]) is blind
//! to anything the accessibility tree omits — canvas/WebGL-rendered UIs, text
//! baked into images, and controls with an empty accessible name. This pipeline
//! runs OCR over the screenshot and fuses the result with the AX elements two
//! ways:
//!
//! - **Label recovery:** an interactable AX element with no accessible name
//!   adopts the label from an OCR text region sitting on it, and its source
//!   becomes [`ElementSource::Merged`]. The element's `ax_ref` (and therefore
//!   deterministic execution) is unchanged — only the human-readable handle is
//!   recovered.
//! - **Invisible text:** OCR text not already present in the AX tree is added
//!   as a [`TextSource::Ocr`](super::ui_map::TextSource::Ocr) text block, so the
//!   agent can read canvas/image text it otherwise couldn't see.
//!
//! OCR cannot tell whether a pixel region is *clickable*, so this pipeline never
//! fabricates interactive elements out of OCR regions — doing so would invite
//! the agent to "click" non-controls. A future visual element detector is what
//! would legitimately populate [`ElementSource::VisualDetector`]; the merge slot
//! is already here for it.
//!
//! OCR is gated at runtime by [`car_vision::is_available`] (Apple Vision on
//! macOS, Tesseract CLI elsewhere). When no backend is present, or OCR fails,
//! the pipeline degrades cleanly to AX-only output.

use async_trait::async_trait;
use std::io::Write as _;

use super::ax_converter::AxConverter;
use super::pipeline::{extract_ax_text_blocks, PerceptionError, PerceptionPipeline};
use super::signals::SignalDetector;
use super::ui_map::{ElementSource, TextBlock, UiElement, UiMap};
use crate::models::{A11yNode, Bounds, Viewport};

/// A recognized OCR text region, already converted into the top-left CSS-pixel
/// space the AX-tree bounds use.
#[derive(Debug, Clone)]
struct OcrRegion {
    /// Recognized text; `run_ocr_blocking` stores it with surrounding
    /// whitespace trimmed and never emits an empty string.
    text: String,
    /// Region rectangle, origin top-left, in CSS pixels.
    bounds: Bounds,
    /// Recognition confidence, copied unchanged from the OCR observation.
    confidence: f32,
}

/// AX tree + OCR perception pipeline. See module docs.
pub struct VisionPerceptionPipeline {
    /// Converts the raw accessibility nodes into the map's UI elements.
    converter: AxConverter,
    /// Derives the page-level signals from the accessibility nodes.
    signal_detector: SignalDetector,
}

impl VisionPerceptionPipeline {
    /// Builds a pipeline with a freshly constructed AX converter and signal
    /// detector.
    pub fn new() -> Self {
        let converter = AxConverter::new();
        let signal_detector = SignalDetector::new();
        Self {
            converter,
            signal_detector,
        }
    }
}

impl Default for VisionPerceptionPipeline {
    fn default() -> Self {
        Self::new()
    }
}

#[async_trait]
impl PerceptionPipeline for VisionPerceptionPipeline {
    async fn perceive(
        &self,
        screenshot: &[u8],
        a11y_nodes: &[A11yNode],
        url: &str,
        viewport: Viewport,
    ) -> Result<UiMap, PerceptionError> {
        let mut elements = self.converter.convert(a11y_nodes);
        let mut text_blocks = extract_ax_text_blocks(a11y_nodes);
        let page_signals = self.signal_detector.detect(a11y_nodes);

        // OCR augmentation is best-effort: never fail perception because OCR
        // is unavailable or errored — fall back to the AX-only result.
        if !screenshot.is_empty() && car_vision::is_available() {
            let bytes = screenshot.to_vec();
            // Scale OCR's normalized coords by the *actual* CSS viewport, taken
            // from the AX tree (the same coordinate space the element bounds
            // live in) — NOT the passed `viewport`, which some backends report
            // from launch config rather than the live rendered size. Getting
            // this wrong stretches OCR regions off their controls and breaks
            // label recovery (caught by the live test).
            let (cw, ch) = css_viewport_from_ax(a11y_nodes, viewport);
            // car-vision's `recognize` is synchronous and (on macOS) crosses an
            // FFI boundary, so run it off the async executor.
            let ocr = tokio::task::spawn_blocking(move || run_ocr_blocking(&bytes, cw, ch)).await;
            match ocr {
                Ok(Ok(regions)) if !regions.is_empty() => {
                    merge_ocr(&mut elements, &mut text_blocks, &regions);
                }
                Ok(Ok(_)) => {}
                Ok(Err(e)) => tracing::debug!(error = %e, "OCR augmentation skipped"),
                Err(e) => tracing::debug!(error = %e, "OCR task join failed"),
            }
        }

        Ok(UiMap::new(
            url.to_string(),
            elements,
            text_blocks,
            page_signals,
            viewport,
            String::new(),
        ))
    }
}

/// Determines the CSS-pixel viewport `(width, height)` that the AX element
/// bounds are expressed in, which is the space OCR regions must be scaled into
/// for overlap with element bounds to be meaningful.
///
/// Resolution order:
/// 1. the first node whose lowercased role contains `"webarea"` and whose
///    bounds have positive width and height (Chrome reports the root web area
///    as the layout viewport);
/// 2. the largest right/bottom extent over all node bounds, if both are
///    positive;
/// 3. the passed `viewport`.
fn css_viewport_from_ax(nodes: &[A11yNode], viewport: Viewport) -> (f64, f64) {
    for node in nodes {
        let size = &node.bounds;
        let has_size = size.width > 0.0 && size.height > 0.0;
        // "rootwebarea" also contains "webarea", so one substring test covers
        // both role spellings.
        if has_size && node.role.to_lowercase().contains("webarea") {
            return (size.width, size.height);
        }
    }

    // No usable web-area node: take the furthest right and bottom edges.
    let (extent_x, extent_y) = nodes
        .iter()
        .fold((0.0_f64, 0.0_f64), |(right, bottom), node| {
            let b = &node.bounds;
            (right.max(b.x + b.width), bottom.max(b.y + b.height))
        });
    if extent_x > 0.0 && extent_y > 0.0 {
        return (extent_x, extent_y);
    }

    (viewport.width as f64, viewport.height as f64)
}

/// Runs OCR over `screenshot` synchronously and returns the recognized regions
/// in top-left CSS pixels.
///
/// The bytes are written to a temporary `.png` file whose path is handed to
/// `car_vision::ocr::recognize`. Each observation is then converted from
/// Vision's normalized bottom-left space by flipping Y and scaling by the CSS
/// viewport (`cw` × `ch`) that the AX bounds use. Observations whose trimmed
/// text is empty, or whose width or height is not positive, are discarded.
///
/// # Errors
///
/// Returns a human-readable message if the temp file cannot be created,
/// written or flushed, or if recognition itself fails.
fn run_ocr_blocking(screenshot: &[u8], cw: f64, ch: f64) -> Result<Vec<OcrRegion>, String> {
    let mut png_file = tempfile::Builder::new()
        .suffix(".png")
        .tempfile()
        .map_err(|e| format!("temp file: {e}"))?;
    png_file
        .write_all(screenshot)
        .map_err(|e| format!("write screenshot: {e}"))?;
    png_file.flush().map_err(|e| format!("flush: {e}"))?;

    let config = car_vision::ocr::OcrConfig {
        // Perception is latency-sensitive and the labels being recovered are
        // short, so the fast path is the right trade.
        fast_path: true,
        languages: Vec::new(),
        language_correction: true,
        // NOTE(review): 0.0 presumably means "no minimum height", so no
        // tiny/noise text is filtered out by the backend; the empty-text and
        // zero-size checks below are the only filtering in this function. Set
        // a positive value if sub-pixel noise should be dropped — confirm
        // against car-vision's semantics.
        minimum_text_height: 0.0,
    };
    let observations = car_vision::ocr::recognize(png_file.path(), &config)
        .map_err(|e| format!("ocr: {e}"))?;

    let regions: Vec<OcrRegion> = observations
        .into_iter()
        .filter_map(|obs| {
            let label = obs.text.trim();
            let has_area = obs.w > 0.0 && obs.h > 0.0;
            if label.is_empty() || !has_area {
                return None;
            }
            // Vision: normalized [0,1], origin bottom-left, y grows up. The
            // top edge in top-left space is therefore `1 - (y + h)`.
            let left = obs.x * cw;
            let top = (1.0 - obs.y - obs.h) * ch;
            Some(OcrRegion {
                text: label.to_string(),
                bounds: Bounds::new(left, top, obs.w * cw, obs.h * ch),
                confidence: obs.confidence,
            })
        })
        .collect();
    Ok(regions)
}

/// Fuse OCR regions into the AX-derived elements and text blocks. See module
/// docs for the two behaviors (label recovery, invisible text).
///
/// For each region, in order:
///
/// 1. **Label recovery.** Among interactable elements with no (or a blank)
///    name that contain at least `MIN_CONTAINMENT` of the region's area, the
///    one with the smallest bounds adopts the region text, is marked
///    [`ElementSource::Merged`], and takes the merged confidence prior. The
///    region is consumed and is not also emitted as a text block.
/// 2. **Invisible text.** Otherwise the region is appended to `text_blocks` as
///    an OCR text block, unless text with the same dedup key is already known.
///
/// The dedup key is the trimmed, lowercased text. It is applied identically to
/// AX names, AX text blocks and OCR text, and is kept in a hash set so each
/// region is checked in constant time rather than by scanning every known
/// string.
fn merge_ocr(elements: &mut [UiElement], text_blocks: &mut Vec<TextBlock>, regions: &[OcrRegion]) {
    // Strong containment (≥60% of the region's area inside the element), not
    // just a center hit, is the load-bearing safety property. OCR bounds are
    // viewport-space; the AX bounds are whatever `getBoxModel` reports. If
    // those spaces ever diverge (e.g. a scrolled page where the AX bounds turn
    // out document-relative), strong containment simply fails to match and the
    // element is left AX-only — a missed enrichment, never a *wrong* label
    // grafted onto the wrong control.
    const MIN_CONTAINMENT: f64 = 0.60;

    /// Normalization used for every dedup comparison.
    fn dedup_key(text: &str) -> String {
        text.trim().to_lowercase()
    }

    // Corpus of text the AX tree already exposes.
    let mut known: std::collections::HashSet<String> = std::collections::HashSet::new();
    known.extend(
        elements
            .iter()
            .filter_map(|el| el.name.as_deref())
            .map(dedup_key),
    );
    known.extend(text_blocks.iter().map(|tb| dedup_key(&tb.text)));

    for region in regions {
        let key = dedup_key(&region.text);

        let target = elements
            .iter_mut()
            .filter(|el| {
                el.role.is_interactable()
                    && el.is_interactable()
                    && el.name.as_deref().map(str::trim).unwrap_or("").is_empty()
                    && containment_ratio(&region.bounds, &el.bounds) >= MIN_CONTAINMENT
            })
            // Smallest candidate wins, so a nested control beats its container.
            .min_by(|a, b| {
                let area = |b: &Bounds| b.width * b.height;
                area(&a.bounds)
                    .partial_cmp(&area(&b.bounds))
                    .unwrap_or(std::cmp::Ordering::Equal)
            });

        if let Some(el) = target {
            el.name = Some(region.text.clone());
            el.source = ElementSource::Merged {
                sources: vec![ElementSource::AccessibilityTree, ElementSource::Ocr],
            };
            // Two corroborating sources (AX existence + OCR label) — adopt the
            // merged confidence prior.
            el.confidence = ElementSource::Merged { sources: Vec::new() }.base_confidence();
            // The recovered label is now AX-visible text; later regions that
            // repeat it must not be surfaced again.
            known.insert(key);
            continue;
        }

        // Invisible text: surface OCR text the AX tree doesn't already carry.
        // Dedup on EXACT (case-insensitive) equality only — substring matching
        // over-suppresses legitimately distinct text (the OCR token "OK" is a
        // substring of an AX label "BOOK"). A rare duplicate line in context is
        // far cheaper than silently dropping real on-screen text.
        // `insert` returns `true` only when the key was not present yet.
        if known.insert(key) {
            text_blocks.push(TextBlock::from_ocr(
                region.text.clone(),
                region.bounds,
                region.confidence,
            ));
        }
    }
}

/// Returns how much of `region` is covered by `el`, as a fraction of
/// `region`'s own area (0.0–1.0). This is the measure used to decide whether
/// an OCR text region sits on a control.
///
/// A region whose area is zero or negative yields `0.0`.
fn containment_ratio(region: &Bounds, el: &Bounds) -> f64 {
    let region_area = region.width * region.height;
    if region_area <= 0.0 {
        return 0.0;
    }

    // Edges of the intersection rectangle.
    let left = region.x.max(el.x);
    let top = region.y.max(el.y);
    let right = (region.x + region.width).min(el.x + el.width);
    let bottom = (region.y + region.height).min(el.y + el.height);

    // Disjoint rectangles produce a negative extent on some axis; clamp to 0.
    let overlap_w = (right - left).max(0.0);
    let overlap_h = (bottom - top).max(0.0);

    overlap_w * overlap_h / region_area
}

#[cfg(test)]
mod tests {
    //! Unit tests for the pure parts of the pipeline: OCR/AX fusion
    //! (`merge_ocr`), overlap measurement (`containment_ratio`) and CSS
    //! viewport resolution (`css_viewport_from_ax`). None of them run OCR.

    use super::*;
    use crate::perception::ui_map::{TextSource, UiRole, UiState};

    /// An enabled AX button with no accessible name — the canonical
    /// label-recovery candidate.
    fn nameless_button(id: &str, b: Bounds) -> UiElement {
        UiElement {
            id: id.to_string(),
            role: UiRole::Button,
            name: None,
            value: None,
            bounds: b,
            states: UiState::enabled(),
            confidence: 0.9,
            source: ElementSource::AccessibilityTree,
            icon_type: None,
            children: vec![],
            ax_ref: Some(format!("ax-{id}")),
        }
    }

    #[test]
    fn label_recovery_fills_nameless_element_and_marks_merged() {
        let mut els = vec![nameless_button("el_0", Bounds::new(100.0, 100.0, 80.0, 30.0))];
        let mut tbs: Vec<TextBlock> = vec![];
        // OCR text whose center (140,115) sits on the button.
        let regions = vec![OcrRegion {
            text: "Submit".to_string(),
            bounds: Bounds::new(110.0, 105.0, 60.0, 20.0),
            confidence: 0.95,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert_eq!(els[0].name.as_deref(), Some("Submit"));
        assert!(matches!(els[0].source, ElementSource::Merged { .. }));
        // ax_ref (execution handle) is untouched.
        assert_eq!(els[0].ax_ref.as_deref(), Some("ax-el_0"));
        // Consumed as a label, not duplicated as a text block.
        assert!(tbs.is_empty());
    }

    #[test]
    fn weak_overlap_is_not_adopted_as_label() {
        // The region's center (140,115) is on the button, but only half of its
        // area is inside it (80 of 160 px wide) — below the 60% containment
        // threshold. It must NOT become the button's label; it is surfaced as
        // plain OCR text instead.
        let mut els = vec![nameless_button("el_0", Bounds::new(100.0, 100.0, 80.0, 30.0))];
        let mut tbs: Vec<TextBlock> = vec![];
        let regions = vec![OcrRegion {
            text: "Long caption across the canvas".to_string(),
            bounds: Bounds::new(60.0, 105.0, 160.0, 20.0),
            confidence: 0.9,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert!(els[0].name.is_none(), "weakly overlapping text must not relabel");
        assert!(matches!(els[0].source, ElementSource::AccessibilityTree));
        assert_eq!(tbs.len(), 1);
        assert_eq!(tbs[0].text, "Long caption across the canvas");
        assert_eq!(tbs[0].source, TextSource::Ocr);
    }

    #[test]
    fn smallest_containing_element_wins() {
        // Both nameless controls fully contain the region; the nested (smaller)
        // one is the control the text actually sits on.
        let mut els = vec![
            nameless_button("outer", Bounds::new(0.0, 0.0, 400.0, 200.0)),
            nameless_button("inner", Bounds::new(100.0, 100.0, 80.0, 30.0)),
        ];
        let mut tbs: Vec<TextBlock> = vec![];
        let regions = vec![OcrRegion {
            text: "Submit".to_string(),
            bounds: Bounds::new(110.0, 105.0, 60.0, 20.0),
            confidence: 0.95,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert!(els[0].name.is_none());
        assert_eq!(els[1].name.as_deref(), Some("Submit"));
        assert!(tbs.is_empty());
    }

    #[test]
    fn invisible_text_becomes_ocr_text_block() {
        // No element under the OCR region → it's canvas/image text the AX tree
        // missed; surface it as an Ocr text block.
        let mut els: Vec<UiElement> = vec![];
        let mut tbs: Vec<TextBlock> = vec![];
        let regions = vec![OcrRegion {
            text: "Score: 42".to_string(),
            bounds: Bounds::new(500.0, 20.0, 90.0, 18.0),
            confidence: 0.88,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert_eq!(tbs.len(), 1);
        assert_eq!(tbs[0].text, "Score: 42");
        assert_eq!(tbs[0].source, TextSource::Ocr);
    }

    #[test]
    fn ocr_duplicate_of_ax_text_is_dropped() {
        // OCR re-reading text the AX tree already exposes must not double it.
        let mut els = vec![UiElement {
            name: Some("Welcome back".to_string()),
            ..nameless_button("el_0", Bounds::new(0.0, 0.0, 300.0, 40.0))
        }];
        // Element already named, so it's not a label-recovery target; the OCR
        // region duplicating its text should be deduped, not added.
        let mut tbs: Vec<TextBlock> = vec![];
        let regions = vec![OcrRegion {
            text: "Welcome back".to_string(),
            bounds: Bounds::new(800.0, 800.0, 100.0, 20.0), // far from the element
            confidence: 0.9,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert!(tbs.is_empty(), "duplicate of an AX name must be dropped");
    }

    #[test]
    fn dedup_is_case_insensitive_but_not_substring() {
        // "OK" is a substring of the AX label "BOOK" but is distinct text and
        // must survive; "book" differs only by case and must be dropped.
        let mut els = vec![UiElement {
            name: Some("BOOK".to_string()),
            ..nameless_button("el_0", Bounds::new(0.0, 0.0, 300.0, 40.0))
        }];
        let mut tbs: Vec<TextBlock> = vec![];
        let regions = vec![
            OcrRegion {
                text: "OK".to_string(),
                bounds: Bounds::new(800.0, 800.0, 30.0, 20.0),
                confidence: 0.9,
            },
            OcrRegion {
                text: "book".to_string(),
                bounds: Bounds::new(800.0, 840.0, 60.0, 20.0),
                confidence: 0.9,
            },
        ];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert_eq!(tbs.len(), 1);
        assert_eq!(tbs[0].text, "OK");
    }

    #[test]
    fn containment_ratio_measures_fraction_of_region() {
        let button = Bounds::new(100.0, 100.0, 80.0, 30.0);
        // Fully inside.
        assert_eq!(
            containment_ratio(&Bounds::new(110.0, 105.0, 60.0, 20.0), &button),
            1.0
        );
        // Exactly half of the region's width overlaps.
        assert_eq!(
            containment_ratio(&Bounds::new(60.0, 105.0, 160.0, 20.0), &button),
            0.5
        );
        // Disjoint.
        assert_eq!(
            containment_ratio(&Bounds::new(800.0, 800.0, 10.0, 10.0), &button),
            0.0
        );
        // Zero-area region never counts as contained.
        assert_eq!(
            containment_ratio(&Bounds::new(110.0, 105.0, 0.0, 20.0), &button),
            0.0
        );
    }

    #[test]
    fn css_viewport_prefers_rootwebarea_over_passed_viewport() {
        use crate::models::A11yNode;
        // A backend that mis-reports the viewport (launch config 1280x720) but
        // whose AX root web area is the real 800x600 layout viewport — OCR must
        // scale by 800x600 or label recovery breaks (the live-test bug).
        let nodes = vec![A11yNode {
            node_id: "root".into(),
            role: "RootWebArea".into(),
            name: None,
            value: None,
            bounds: Bounds::new(0.0, 0.0, 800.0, 600.0),
            children: vec![],
            focusable: false,
            focused: false,
            disabled: false,
        }];
        let vp = Viewport {
            width: 1280,
            height: 720,
            device_pixel_ratio: 1.0,
        };
        assert_eq!(css_viewport_from_ax(&nodes, vp), (800.0, 600.0));
    }

    #[test]
    fn css_viewport_falls_back_to_max_extent_then_viewport() {
        use crate::models::A11yNode;
        // No web-area node → use the max extent of element bounds.
        let nodes = vec![A11yNode {
            node_id: "b".into(),
            role: "button".into(),
            name: Some("x".into()),
            value: None,
            bounds: Bounds::new(10.0, 20.0, 100.0, 30.0), // extent 110x50
            children: vec![],
            focusable: true,
            focused: false,
            disabled: false,
        }];
        let vp = Viewport {
            width: 1280,
            height: 720,
            device_pixel_ratio: 1.0,
        };
        assert_eq!(css_viewport_from_ax(&nodes, vp), (110.0, 50.0));
        // Empty tree → fall back to the passed viewport.
        assert_eq!(css_viewport_from_ax(&[], vp), (1280.0, 720.0));
    }

    #[test]
    fn named_element_is_not_relabeled() {
        // A control that already has a good name must not be overwritten by an
        // overlapping OCR region.
        let mut els = vec![UiElement {
            name: Some("Sign in".to_string()),
            ..nameless_button("el_0", Bounds::new(100.0, 100.0, 80.0, 30.0))
        }];
        let mut tbs: Vec<TextBlock> = vec![];
        let regions = vec![OcrRegion {
            text: "garbled ocr".to_string(),
            bounds: Bounds::new(120.0, 108.0, 40.0, 14.0),
            confidence: 0.4,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert_eq!(els[0].name.as_deref(), Some("Sign in"));
        assert!(matches!(els[0].source, ElementSource::AccessibilityTree));
    }
}