car-browser 0.32.0

Browser automation and perception pipeline for Common Agent Runtime
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
//! Vision-augmented perception: accessibility tree + OCR fusion.
//!
//! The AX-only pipeline ([`super::pipeline::BasicPerceptionPipeline`]) is blind
//! to anything the accessibility tree omits — canvas/WebGL-rendered UIs, text
//! baked into images, and controls with an empty accessible name. This pipeline
//! runs OCR over the screenshot and fuses the result with the AX elements two
//! ways:
//!
//! - **Label recovery:** an interactable AX element with no accessible name
//!   adopts the label from an OCR text region sitting on it, and its source
//!   becomes [`ElementSource::Merged`]. The element's `ax_ref` (and therefore
//!   deterministic execution) is unchanged — only the human-readable handle is
//!   recovered.
//! - **Invisible text:** OCR text not already present in the AX tree is added
//!   as a [`TextSource::Ocr`](super::ui_map::TextSource::Ocr) text block, so
//!   the agent can read canvas/image text it otherwise couldn't see.
//!
//! OCR cannot tell whether a pixel region is *clickable*, so this pipeline never
//! fabricates interactive elements out of OCR regions — doing so would invite
//! the agent to "click" non-controls. A future visual element detector is what
//! would legitimately populate [`ElementSource::VisualDetector`]; the merge slot
//! is already here for it.
//!
//! OCR is gated at runtime by [`car_vision::is_available`] (Apple Vision on
//! macOS, Tesseract CLI elsewhere). When no backend is present, or OCR fails,
//! the pipeline degrades cleanly to AX-only output.

use async_trait::async_trait;
use std::io::Write as _;

use super::ax_converter::AxConverter;
use super::pipeline::{extract_ax_text_blocks, PerceptionError, PerceptionPipeline};
use super::signals::SignalDetector;
use super::ui_map::{ElementSource, TextBlock, UiElement, UiMap};
use crate::models::{A11yNode, Bounds, Viewport};

/// A recognized OCR text region, already converted into the top-left CSS-pixel
/// space the AX-tree bounds use.
///
/// Produced by `run_ocr_blocking` and consumed by `merge_ocr`.
#[derive(Debug, Clone)]
struct OcrRegion {
    /// Recognized text. `run_ocr_blocking` trims surrounding whitespace and
    /// never emits an empty string here.
    text: String,
    /// Box of the recognized text: top-left origin, scaled by the CSS viewport
    /// size that `run_ocr_blocking` was given.
    bounds: Bounds,
    /// Confidence copied from the OCR observation; forwarded unchanged to
    /// `TextBlock::from_ocr` when the region is surfaced as a text block.
    confidence: f32,
}

/// AX tree + OCR perception pipeline. See module docs.
pub struct VisionPerceptionPipeline {
    /// Turns the raw accessibility nodes into the UI elements that OCR labels
    /// are later merged into.
    converter: AxConverter,
    /// Derives the page-level signals from the accessibility nodes; OCR output
    /// does not feed into it.
    signal_detector: SignalDetector,
}

impl VisionPerceptionPipeline {
    /// Create a pipeline with a freshly constructed AX converter and signal
    /// detector.
    pub fn new() -> Self {
        let converter = AxConverter::new();
        let signal_detector = SignalDetector::new();
        Self {
            converter,
            signal_detector,
        }
    }
}

impl Default for VisionPerceptionPipeline {
    fn default() -> Self {
        Self::new()
    }
}

#[async_trait]
impl PerceptionPipeline for VisionPerceptionPipeline {
    async fn perceive(
        &self,
        screenshot: &[u8],
        a11y_nodes: &[A11yNode],
        url: &str,
        viewport: Viewport,
    ) -> Result<UiMap, PerceptionError> {
        let mut elements = self.converter.convert(a11y_nodes);
        let mut text_blocks = extract_ax_text_blocks(a11y_nodes);
        let page_signals = self.signal_detector.detect(a11y_nodes);

        // OCR augmentation is best-effort: never fail perception because OCR
        // is unavailable or errored — fall back to the AX-only result.
        if !screenshot.is_empty() && car_vision::is_available() {
            let bytes = screenshot.to_vec();
            // Scale OCR's normalized coords by the *actual* CSS viewport, taken
            // from the AX tree (the same coordinate space the element bounds
            // live in) — NOT the passed `viewport`, which some backends report
            // from launch config rather than the live rendered size. Getting
            // this wrong stretches OCR regions off their controls and breaks
            // label recovery (caught by the live test).
            let (cw, ch) = css_viewport_from_ax(a11y_nodes, viewport);
            // car-vision's `recognize` is synchronous and (on macOS) crosses an
            // FFI boundary, so run it off the async executor.
            let ocr = tokio::task::spawn_blocking(move || run_ocr_blocking(&bytes, cw, ch)).await;
            match ocr {
                Ok(Ok(regions)) if !regions.is_empty() => {
                    merge_ocr(&mut elements, &mut text_blocks, &regions);
                }
                Ok(Ok(_)) => {}
                Ok(Err(e)) => tracing::debug!(error = %e, "OCR augmentation skipped"),
                Err(e) => tracing::debug!(error = %e, "OCR task join failed"),
            }
        }

        Ok(UiMap::new(
            url.to_string(),
            elements,
            text_blocks,
            page_signals,
            viewport,
            String::new(),
        ))
    }
}

/// The CSS-pixel viewport the AX element bounds are expressed in, as
/// `(width, height)`.
///
/// This is the coordinate space OCR regions must be scaled into so overlap with
/// element bounds is meaningful. Resolution order:
///
/// 1. The first node whose role is exactly `RootWebArea` (ASCII
///    case-insensitive) and whose box has positive width and height. The root
///    is preferred explicitly so that a nested web area (e.g. an embedded
///    document) appearing earlier in `nodes` cannot win.
/// 2. Otherwise, the first node whose lowercased role contains `webarea` and
///    whose box has positive width and height.
/// 3. Otherwise, the maximum right/bottom extent over all node bounds, if both
///    are positive.
/// 4. Otherwise, the passed `viewport`.
fn css_viewport_from_ax(nodes: &[A11yNode], viewport: Viewport) -> (f64, f64) {
    /// A node can only define the viewport if its box is non-degenerate.
    fn has_area(node: &A11yNode) -> bool {
        node.bounds.width > 0.0 && node.bounds.height > 0.0
    }

    let web_area = nodes
        .iter()
        .find(|&n| has_area(n) && n.role.eq_ignore_ascii_case("rootwebarea"))
        .or_else(|| {
            nodes
                .iter()
                .find(|&n| has_area(n) && n.role.to_lowercase().contains("webarea"))
        });
    if let Some(area) = web_area {
        return (area.bounds.width, area.bounds.height);
    }

    // No usable web-area node: approximate the viewport by the furthest
    // right/bottom edge any node reaches.
    let max_x = nodes
        .iter()
        .map(|n| n.bounds.x + n.bounds.width)
        .fold(0.0_f64, f64::max);
    let max_y = nodes
        .iter()
        .map(|n| n.bounds.y + n.bounds.height)
        .fold(0.0_f64, f64::max);
    if max_x > 0.0 && max_y > 0.0 {
        (max_x, max_y)
    } else {
        (viewport.width as f64, viewport.height as f64)
    }
}

/// Run OCR over `screenshot` and return the recognized regions in top-left CSS
/// pixels.
///
/// The bytes are written to a temporary `.png` file because the OCR entry point
/// takes a path. Each observation arrives in Vision's normalized, bottom-left
/// origin space and is flipped and scaled by the CSS viewport (`cw` × `ch`) the
/// AX bounds use. Observations with blank text or a non-positive width/height
/// are discarded.
///
/// Blocking: performs file I/O and a synchronous OCR call, so callers on an
/// async executor should run it via `spawn_blocking`.
///
/// # Errors
///
/// Returns a human-readable message if the temp file cannot be created,
/// written, or flushed, or if the OCR backend reports an error.
fn run_ocr_blocking(screenshot: &[u8], cw: f64, ch: f64) -> Result<Vec<OcrRegion>, String> {
    let mut image_file = tempfile::Builder::new()
        .suffix(".png")
        .tempfile()
        .map_err(|e| format!("temp file: {e}"))?;
    image_file
        .write_all(screenshot)
        .map_err(|e| format!("write screenshot: {e}"))?;
    image_file.flush().map_err(|e| format!("flush: {e}"))?;

    let config = car_vision::ocr::OcrConfig {
        // Perception is latency-sensitive and recovered labels are short, so
        // the fast recognition path is the right trade-off.
        fast_path: true,
        languages: Vec::new(),
        language_correction: true,
        // NOTE(review): the previous comment here said this drops sub-pixel
        // noise text, but 0.0 looks like "no minimum height" — confirm the
        // intended value against the car_vision documentation.
        minimum_text_height: 0.0,
    };

    let observations = match car_vision::ocr::recognize(image_file.path(), &config) {
        Ok(found) => found,
        Err(e) => return Err(format!("ocr: {e}")),
    };

    let mut regions = Vec::new();
    for obs in observations {
        let label = obs.text.trim();
        let has_extent = obs.w > 0.0 && obs.h > 0.0;
        if label.is_empty() || !has_extent {
            continue;
        }
        // Vision: normalized [0,1], origin bottom-left, y grows up. Flip Y and
        // scale to CSS pixels so the box lines up with the AX-tree bounds.
        let left = obs.x * cw;
        let top = (1.0 - obs.y - obs.h) * ch;
        regions.push(OcrRegion {
            text: label.to_string(),
            bounds: Bounds::new(left, top, obs.w * cw, obs.h * ch),
            confidence: obs.confidence,
        });
    }
    Ok(regions)
}

/// Fuse OCR regions into the AX-derived elements and text blocks.
///
/// Each region is handled in order and takes exactly one of two paths:
///
/// - **Label recovery:** if the region sits substantially inside one or more
///   nameless interactable elements, the smallest such element adopts the
///   region's text as its name, its source becomes
///   [`ElementSource::Merged`], and its confidence is set to the merged prior.
///   The region is then consumed and is *not* also emitted as a text block.
/// - **Invisible text:** otherwise the region is appended to `text_blocks` via
///   `TextBlock::from_ocr`, unless its text (trimmed, case-insensitive) is
///   already exposed by an element name, an existing text block, or an earlier
///   region in this call.
///
/// Elements that already have a non-blank name are never relabeled.
fn merge_ocr(elements: &mut [UiElement], text_blocks: &mut Vec<TextBlock>, regions: &[OcrRegion]) {
    // Label recovery requires strong containment (≥60% of the region's area
    // inside the element), not just a center hit — this is the load-bearing
    // safety property. OCR bounds are viewport-space; the AX bounds are
    // whatever the backend reports. If those spaces ever diverge (e.g. a
    // scrolled page where the AX bounds turn out document-relative), strong
    // containment simply fails to match and the element is left AX-only — a
    // missed enrichment, never a *wrong* label grafted onto the wrong control.
    const MIN_CONTAINMENT: f64 = 0.60;

    /// Normalization used for duplicate detection. Applied identically to AX
    /// text and OCR text so both sides of the comparison agree.
    fn dedup_key(text: &str) -> String {
        text.trim().to_lowercase()
    }

    // Text the AX tree already exposes, plus everything this call has emitted
    // so far. A set keeps the per-region membership check constant-time.
    let mut known: std::collections::HashSet<String> = std::collections::HashSet::new();
    known.extend(
        elements
            .iter()
            .filter_map(|el| el.name.as_deref())
            .map(dedup_key),
    );
    known.extend(text_blocks.iter().map(|tb| dedup_key(&tb.text)));

    for region in regions {
        let target = elements
            .iter_mut()
            .filter(|el| {
                el.role.is_interactable()
                    && el.is_interactable()
                    && el.name.as_deref().map(str::trim).unwrap_or("").is_empty()
                    && containment_ratio(&region.bounds, &el.bounds) >= MIN_CONTAINMENT
            })
            // Prefer the tightest-fitting control when several contain the
            // region (e.g. a button nested inside a clickable container).
            .min_by(|a, b| {
                let area = |bounds: &Bounds| bounds.width * bounds.height;
                area(&a.bounds)
                    .partial_cmp(&area(&b.bounds))
                    .unwrap_or(std::cmp::Ordering::Equal)
            });

        if let Some(el) = target {
            el.name = Some(region.text.clone());
            el.source = ElementSource::Merged {
                sources: vec![ElementSource::AccessibilityTree, ElementSource::Ocr],
            };
            // Two corroborating sources (AX existence + OCR label) — adopt the
            // merged confidence prior.
            el.confidence = ElementSource::Merged { sources: Vec::new() }.base_confidence();
            known.insert(dedup_key(&region.text));
            continue;
        }

        // Invisible text: surface OCR text the AX tree doesn't already carry.
        // Dedup on EXACT (case-insensitive) equality only — substring matching
        // over-suppresses legitimately distinct text (the OCR token "OK" is a
        // substring of an AX label "BOOK"). A rare duplicate line in context is
        // far cheaper than silently dropping real on-screen text.
        //
        // `insert` returns `true` only when the key was not already present,
        // so this both tests and records the text in one step.
        if known.insert(dedup_key(&region.text)) {
            text_blocks.push(TextBlock::from_ocr(
                region.text.clone(),
                region.bounds,
                region.confidence,
            ));
        }
    }
}

/// How much of `region` is covered by `el`, as a fraction of `region`'s own
/// area (0.0 when they are disjoint, 1.0 when `region` lies entirely inside
/// `el`).
///
/// The measure is deliberately asymmetric: a small OCR box fully inside a large
/// control scores 1.0, while a large OCR box that merely touches a small
/// control scores low. A region whose area is zero or negative scores 0.0.
fn containment_ratio(region: &Bounds, el: &Bounds) -> f64 {
    let region_area = region.width * region.height;
    if region_area <= 0.0 {
        return 0.0;
    }

    // Per-axis overlap of the two boxes, clamped so disjoint boxes give 0.
    let left = region.x.max(el.x);
    let right = (region.x + region.width).min(el.x + el.width);
    let top = region.y.max(el.y);
    let bottom = (region.y + region.height).min(el.y + el.height);
    let overlap_w = (right - left).max(0.0);
    let overlap_h = (bottom - top).max(0.0);

    overlap_w * overlap_h / region_area
}

#[cfg(test)]
mod tests {
    //! Unit tests for the pure fusion helpers. None of these invoke an OCR
    //! backend: OCR regions are constructed by hand.

    use super::*;
    use crate::perception::ui_map::{TextSource, UiRole, UiState};

    /// An enabled, AX-sourced button with no accessible name — the shape of
    /// element that label recovery is meant to enrich.
    fn nameless_button(id: &str, b: Bounds) -> UiElement {
        UiElement {
            id: id.to_string(),
            role: UiRole::Button,
            name: None,
            value: None,
            bounds: b,
            states: UiState::enabled(),
            confidence: 0.9,
            source: ElementSource::AccessibilityTree,
            icon_type: None,
            children: vec![],
            ax_ref: Some(format!("ax-{id}")),
        }
    }

    #[test]
    fn label_recovery_fills_nameless_element_and_marks_merged() {
        let mut els = vec![nameless_button("el_0", Bounds::new(100.0, 100.0, 80.0, 30.0))];
        let mut tbs: Vec<TextBlock> = vec![];
        // OCR text box lying entirely inside the button (containment 1.0).
        let regions = vec![OcrRegion {
            text: "Submit".to_string(),
            bounds: Bounds::new(110.0, 105.0, 60.0, 20.0),
            confidence: 0.95,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert_eq!(els[0].name.as_deref(), Some("Submit"));
        assert!(matches!(els[0].source, ElementSource::Merged { .. }));
        // ax_ref (execution handle) is untouched.
        assert_eq!(els[0].ax_ref.as_deref(), Some("ax-el_0"));
        // Consumed as a label, not duplicated as a text block.
        assert!(tbs.is_empty());
    }

    #[test]
    fn weakly_overlapping_region_does_not_relabel() {
        // The region's center (175,115) is on the button, but only 55% of its
        // area is — below the containment threshold, so the button must stay
        // nameless and the text is surfaced as an OCR text block instead.
        let mut els = vec![nameless_button("el_0", Bounds::new(100.0, 100.0, 80.0, 30.0))];
        let mut tbs: Vec<TextBlock> = vec![];
        let regions = vec![OcrRegion {
            text: "Neighbouring caption".to_string(),
            bounds: Bounds::new(125.0, 105.0, 100.0, 20.0),
            confidence: 0.9,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert!(els[0].name.is_none());
        assert!(matches!(els[0].source, ElementSource::AccessibilityTree));
        assert_eq!(tbs.len(), 1);
        assert_eq!(tbs[0].text, "Neighbouring caption");
    }

    #[test]
    fn smallest_containing_element_receives_the_label() {
        // Both the outer container and the inner button fully contain the
        // region; the tighter (inner) one must be the one that is labeled.
        let mut els = vec![
            nameless_button("outer", Bounds::new(0.0, 0.0, 400.0, 400.0)),
            nameless_button("inner", Bounds::new(100.0, 100.0, 80.0, 30.0)),
        ];
        let mut tbs: Vec<TextBlock> = vec![];
        let regions = vec![OcrRegion {
            text: "Submit".to_string(),
            bounds: Bounds::new(110.0, 105.0, 60.0, 20.0),
            confidence: 0.95,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert!(els[0].name.is_none());
        assert!(matches!(els[0].source, ElementSource::AccessibilityTree));
        assert_eq!(els[1].name.as_deref(), Some("Submit"));
        assert!(matches!(els[1].source, ElementSource::Merged { .. }));
        assert!(tbs.is_empty());
    }

    #[test]
    fn containment_ratio_measures_fraction_of_region_inside_element() {
        let el = Bounds::new(100.0, 100.0, 80.0, 30.0);
        // Fully inside.
        assert_eq!(
            containment_ratio(&Bounds::new(110.0, 105.0, 60.0, 20.0), &el),
            1.0
        );
        // Right half of the region hangs off the element.
        assert_eq!(
            containment_ratio(&Bounds::new(140.0, 100.0, 80.0, 30.0), &el),
            0.5
        );
        // Disjoint.
        assert_eq!(
            containment_ratio(&Bounds::new(500.0, 500.0, 10.0, 10.0), &el),
            0.0
        );
        // Zero-area region.
        assert_eq!(
            containment_ratio(&Bounds::new(110.0, 105.0, 0.0, 20.0), &el),
            0.0
        );
    }

    #[test]
    fn invisible_text_becomes_ocr_text_block() {
        // No element under the OCR region → it's canvas/image text the AX tree
        // missed; surface it as an Ocr text block.
        let mut els: Vec<UiElement> = vec![];
        let mut tbs: Vec<TextBlock> = vec![];
        let regions = vec![OcrRegion {
            text: "Score: 42".to_string(),
            bounds: Bounds::new(500.0, 20.0, 90.0, 18.0),
            confidence: 0.88,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert_eq!(tbs.len(), 1);
        assert_eq!(tbs[0].text, "Score: 42");
        assert_eq!(tbs[0].source, TextSource::Ocr);
    }

    #[test]
    fn ocr_duplicate_of_ax_text_is_dropped() {
        // OCR re-reading text the AX tree already exposes must not double it.
        let mut els = vec![UiElement {
            name: Some("Welcome back".to_string()),
            ..nameless_button("el_0", Bounds::new(0.0, 0.0, 300.0, 40.0))
        }];
        // Element already named, so it's not a label-recovery target; the OCR
        // region duplicating its text should be deduped, not added.
        let mut tbs: Vec<TextBlock> = vec![];
        let regions = vec![OcrRegion {
            text: "Welcome back".to_string(),
            bounds: Bounds::new(800.0, 800.0, 100.0, 20.0), // far from the element
            confidence: 0.9,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert!(tbs.is_empty(), "duplicate of an AX name must be dropped");
    }

    #[test]
    fn css_viewport_prefers_rootwebarea_over_passed_viewport() {
        use crate::models::A11yNode;
        // A backend that mis-reports the viewport (launch config 1280x720) but
        // whose AX root web area is the real 800x600 layout viewport — OCR must
        // scale by 800x600 or label recovery breaks (the live-test bug).
        let nodes = vec![A11yNode {
            node_id: "root".into(),
            role: "RootWebArea".into(),
            name: None,
            value: None,
            bounds: Bounds::new(0.0, 0.0, 800.0, 600.0),
            children: vec![],
            focusable: false,
            focused: false,
            disabled: false,
        }];
        let vp = Viewport {
            width: 1280,
            height: 720,
            device_pixel_ratio: 1.0,
        };
        assert_eq!(css_viewport_from_ax(&nodes, vp), (800.0, 600.0));
    }

    #[test]
    fn css_viewport_falls_back_to_max_extent_then_viewport() {
        use crate::models::A11yNode;
        // No web-area node → use the max extent of element bounds.
        let nodes = vec![A11yNode {
            node_id: "b".into(),
            role: "button".into(),
            name: Some("x".into()),
            value: None,
            bounds: Bounds::new(10.0, 20.0, 100.0, 30.0), // extent 110x50
            children: vec![],
            focusable: true,
            focused: false,
            disabled: false,
        }];
        let vp = Viewport {
            width: 1280,
            height: 720,
            device_pixel_ratio: 1.0,
        };
        assert_eq!(css_viewport_from_ax(&nodes, vp), (110.0, 50.0));
        // Empty tree → fall back to the passed viewport.
        assert_eq!(css_viewport_from_ax(&[], vp), (1280.0, 720.0));
    }

    #[test]
    fn named_element_is_not_relabeled() {
        // A control that already has a good name must not be overwritten by an
        // overlapping OCR region.
        let mut els = vec![UiElement {
            name: Some("Sign in".to_string()),
            ..nameless_button("el_0", Bounds::new(100.0, 100.0, 80.0, 30.0))
        }];
        let mut tbs: Vec<TextBlock> = vec![];
        let regions = vec![OcrRegion {
            text: "garbled ocr".to_string(),
            bounds: Bounds::new(120.0, 108.0, 40.0, 14.0),
            confidence: 0.4,
        }];
        merge_ocr(&mut els, &mut tbs, &regions);
        assert_eq!(els[0].name.as_deref(), Some("Sign in"));
        assert!(matches!(els[0].source, ElementSource::AccessibilityTree));
    }
}