// Source: car_browser/perception/ui_map.rs

//! UI state representation for the perception pipeline.
//!
//! `UiMap` is the core data structure that represents what is visible on screen
//! in a structured, auditable format.
5
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use sha2::{Digest, Sha256};
9
10use crate::models::{Bounds, Viewport};
11
12/// Complete UI state representation.
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct UiMap {
15    pub id: String,
16    pub timestamp: DateTime<Utc>,
17    pub url: String,
18    pub elements: Vec<UiElement>,
19    pub text_blocks: Vec<TextBlock>,
20    pub page_signals: PageSignals,
21    pub viewport: Viewport,
22    pub content_hash: String,
23    pub screenshot_path: String,
24}
25
26impl UiMap {
27    pub fn new(
28        url: String,
29        elements: Vec<UiElement>,
30        text_blocks: Vec<TextBlock>,
31        page_signals: PageSignals,
32        viewport: Viewport,
33        screenshot_path: String,
34    ) -> Self {
35        let id = uuid::Uuid::new_v4().to_string();
36        let timestamp = Utc::now();
37        let mut map = Self {
38            id,
39            timestamp,
40            url,
41            elements,
42            text_blocks,
43            page_signals,
44            viewport,
45            content_hash: String::new(),
46            screenshot_path,
47        };
48        map.content_hash = map.compute_content_hash();
49        map
50    }
51
52    pub fn compute_content_hash(&self) -> String {
53        let mut hasher = Sha256::new();
54        for element in &self.elements {
55            hasher.update(element.id.as_bytes());
56            hasher.update(element.role.to_hash_string().as_bytes());
57            if let Some(name) = &element.name {
58                hasher.update(name.as_bytes());
59            }
60            hasher.update(element.states.to_hash_string().as_bytes());
61        }
62        for block in &self.text_blocks {
63            hasher.update(block.text.as_bytes());
64        }
65        hasher.update(self.page_signals.to_hash_string().as_bytes());
66        hex::encode(hasher.finalize())
67    }
68
69    pub fn get_element(&self, element_id: &str) -> Option<&UiElement> {
70        self.elements.iter().find(|e| e.id == element_id)
71    }
72
73    pub fn get_elements_by_role(&self, role: UiRole) -> Vec<&UiElement> {
74        self.elements
75            .iter()
76            .filter(|e| std::mem::discriminant(&e.role) == std::mem::discriminant(&role))
77            .collect()
78    }
79
80    pub fn interactive_elements(&self) -> Vec<&UiElement> {
81        self.elements
82            .iter()
83            .filter(|e| e.role.is_interactable() && e.is_interactable())
84            .collect()
85    }
86
87    pub fn estimate_tokens(&self, interactive_only: bool) -> usize {
88        let count = if interactive_only {
89            self.interactive_elements().len()
90        } else {
91            self.elements.len()
92        };
93        count * 20 + 30
94    }
95
96    pub fn average_confidence(&self) -> f32 {
97        if self.elements.is_empty() {
98            return 0.0;
99        }
100        let sum: f32 = self.elements.iter().map(|e| e.confidence).sum();
101        sum / self.elements.len() as f32
102    }
103
104    /// Format as a structured page summary for LLM consumption.
105    /// Includes: page signals, visible text content, and interactive elements.
106    /// Targets ~4KB output (vs 86KB for raw accessibility tree).
107    pub fn format_summary(&self) -> String {
108        use std::fmt::Write;
109        let mut output = String::new();
110
111        // Page signals
112        if self.page_signals.has_blocking_element() {
113            if self.page_signals.modal_present {
114                let _ = writeln!(output, "⚠ Modal dialog present");
115            }
116            if self.page_signals.cookie_banner {
117                let _ = writeln!(output, "⚠ Cookie banner present");
118            }
119        }
120        if self.page_signals.loading_indicator {
121            let _ = writeln!(output, "⏳ Page loading...");
122        }
123
124        // Visible text content — what a user would actually see
125        let _ = writeln!(output, "\n## Visible Text");
126        let mut seen_texts: std::collections::HashSet<String> = std::collections::HashSet::new();
127        for el in &self.elements {
128            if let Some(ref name) = el.name {
129                let text = name.trim().to_string();
130                if !text.is_empty() && text.len() > 1 && seen_texts.insert(text.clone()) {
131                    let role = el.role.to_hash_string();
132                    let truncated = if text.len() > 80 {
133                        let end = text.floor_char_boundary(77);
134                        format!("{}...", &text[..end])
135                    } else {
136                        text
137                    };
138                    let _ = writeln!(output, "  ({}) {}", role, truncated);
139                }
140            }
141        }
142
143        // Interactive elements — what the agent can click/type
144        let _ = writeln!(output, "\n## Interactive Elements");
145        let interactive = self.interactive_elements();
146        for (i, el) in interactive.iter().enumerate().take(50) {
147            let role_str = el.role.to_hash_string();
148            let name_str = el.name.as_deref()
149                .map(|n| {
150                    if n.len() > 40 {
151                        let end = n.floor_char_boundary(37);
152                        format!(" \"{}...\"", &n[..end])
153                    } else {
154                        format!(" \"{}\"", n)
155                    }
156                })
157                .unwrap_or_default();
158            let mut state_parts = Vec::new();
159            if element_states_for_summary(&el.states, &mut state_parts) {
160                let _ = writeln!(output, "[{}] {}{} {}", el.id, role_str, name_str, state_parts.join(" "));
161            } else {
162                let _ = writeln!(output, "[{}] {}{}", el.id, role_str, name_str);
163            }
164            let _ = i; // suppress unused
165        }
166        if interactive.len() > 50 {
167            let _ = writeln!(output, "  ... and {} more interactive elements", interactive.len() - 50);
168        }
169
170        output
171    }
172
173    /// Format as compact text for LLM consumption.
174    ///
175    /// Format: `[el_0] Button "Submit" (120,340) focused`
176    pub fn format_compact(&self) -> String {
177        use std::fmt::Write;
178        let mut output = String::new();
179
180        // Page signals header
181        if self.page_signals.has_blocking_element() {
182            if self.page_signals.modal_present {
183                let _ = writeln!(output, "⚠ Modal dialog present");
184            }
185            if self.page_signals.cookie_banner {
186                let _ = writeln!(output, "⚠ Cookie banner present");
187            }
188        }
189        if self.page_signals.loading_indicator {
190            let _ = writeln!(output, "⏳ Page loading...");
191        }
192
193        // Use interactive-only if >40 elements (compact mode)
194        let elements: Vec<&UiElement> = if self.elements.len() > 40 {
195            self.interactive_elements()
196        } else {
197            self.elements.iter().collect()
198        };
199
200        for element in &elements {
201            let role_str = element.role.to_hash_string();
202            let name_str = element
203                .name
204                .as_deref()
205                .map(|n| {
206                    if n.len() > 50 {
207                        let truncated: String = n.chars().take(47).collect();
208                        format!(" \"{}...\"", truncated)
209                    } else {
210                        format!(" \"{}\"", n)
211                    }
212                })
213                .unwrap_or_default();
214
215            let (cx, cy) = element.bounds.center();
216            let pos_str = format!(" ({:.0},{:.0})", cx, cy);
217
218            let mut state_parts = Vec::new();
219            if element.states.focused {
220                state_parts.push("focused");
221            }
222            if !element.states.enabled {
223                state_parts.push("disabled");
224            }
225            if element.states.checked == Some(true) {
226                state_parts.push("checked");
227            }
228            if element.states.expanded == Some(true) {
229                state_parts.push("expanded");
230            }
231            let state_str = if state_parts.is_empty() {
232                String::new()
233            } else {
234                format!(" {}", state_parts.join(" "))
235            };
236
237            let _ = writeln!(
238                output,
239                "[{}] {}{}{}{}",
240                element.id, role_str, name_str, pos_str, state_str
241            );
242        }
243
244        output
245    }
246}
247
248fn element_states_for_summary(states: &UiState, parts: &mut Vec<&'static str>) -> bool {
249    if states.focused { parts.push("focused"); }
250    if !states.enabled { parts.push("disabled"); }
251    if states.checked == Some(true) { parts.push("checked"); }
252    if states.expanded == Some(true) { parts.push("expanded"); }
253    !parts.is_empty()
254}
255
256/// A single interactable UI element.
257#[derive(Debug, Clone, Serialize, Deserialize)]
258pub struct UiElement {
259    pub id: String,
260    pub role: UiRole,
261    pub name: Option<String>,
262    /// Current value (for form inputs — text content, selected option, etc.)
263    pub value: Option<String>,
264    pub bounds: Bounds,
265    pub states: UiState,
266    pub confidence: f32,
267    pub source: ElementSource,
268    pub icon_type: Option<IconType>,
269    pub children: Vec<String>,
270    /// Original AX node ID for execution mapping.
271    pub ax_ref: Option<String>,
272}
273
274impl UiElement {
275    pub fn is_interactable(&self) -> bool {
276        self.states.enabled && self.bounds.width > 0.0 && self.bounds.height > 0.0
277    }
278
279    pub fn center(&self) -> (f64, f64) {
280        self.bounds.center()
281    }
282
283    pub fn accepts_text(&self) -> bool {
284        matches!(self.role, UiRole::TextInput)
285    }
286
287    pub fn is_clickable(&self) -> bool {
288        matches!(self.role, UiRole::Button | UiRole::Link)
289    }
290}
291
292/// Element roles (simplified from full ARIA).
293#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
294#[serde(rename_all = "snake_case")]
295pub enum UiRole {
296    Button,
297    Link,
298    TextInput,
299    Checkbox,
300    Radio,
301    Dropdown,
302    Menu,
303    MenuItem,
304    Tab,
305    Dialog,
306    Image,
307    Text,
308    Container,
309    List,
310    ListItem,
311    Table,
312    TableRow,
313    TableCell,
314    Toolbar,
315    Other(String),
316}
317
318impl UiRole {
319    pub fn from_ax_role(ax_role: &str) -> Self {
320        match ax_role.to_lowercase().as_str() {
321            "button" | "pushbutton" => UiRole::Button,
322            "link" | "weblink" => UiRole::Link,
323            "textbox" | "textfield" | "textarea" | "combobox" | "searchfield" => UiRole::TextInput,
324            "checkbox" => UiRole::Checkbox,
325            "radio" | "radiobutton" => UiRole::Radio,
326            "select" | "listbox" | "popupbutton" => UiRole::Dropdown,
327            "menu" | "menubar" => UiRole::Menu,
328            "menuitem" | "menuitemcheckbox" | "menuitemradio" => UiRole::MenuItem,
329            "tab" | "tabitem" => UiRole::Tab,
330            "dialog" | "alertdialog" | "sheet" => UiRole::Dialog,
331            "image" | "img" => UiRole::Image,
332            "statictext" | "label" | "heading" => UiRole::Text,
333            "group" | "generic" | "section" | "div" | "webarea" => UiRole::Container,
334            "list" => UiRole::List,
335            "listitem" => UiRole::ListItem,
336            "table" | "grid" => UiRole::Table,
337            "row" | "tablerow" => UiRole::TableRow,
338            "cell" | "tablecell" | "gridcell" => UiRole::TableCell,
339            "toolbar" => UiRole::Toolbar,
340            other => UiRole::Other(other.to_string()),
341        }
342    }
343
344    pub fn is_interactable(&self) -> bool {
345        matches!(
346            self,
347            UiRole::Button
348                | UiRole::Link
349                | UiRole::TextInput
350                | UiRole::Checkbox
351                | UiRole::Radio
352                | UiRole::Dropdown
353                | UiRole::MenuItem
354                | UiRole::Tab
355        )
356    }
357
358    pub fn to_hash_string(&self) -> String {
359        match self {
360            UiRole::Button => "button".to_string(),
361            UiRole::Link => "link".to_string(),
362            UiRole::TextInput => "text_input".to_string(),
363            UiRole::Checkbox => "checkbox".to_string(),
364            UiRole::Radio => "radio".to_string(),
365            UiRole::Dropdown => "dropdown".to_string(),
366            UiRole::Menu => "menu".to_string(),
367            UiRole::MenuItem => "menu_item".to_string(),
368            UiRole::Tab => "tab".to_string(),
369            UiRole::Dialog => "dialog".to_string(),
370            UiRole::Image => "image".to_string(),
371            UiRole::Text => "text".to_string(),
372            UiRole::Container => "container".to_string(),
373            UiRole::List => "list".to_string(),
374            UiRole::ListItem => "list_item".to_string(),
375            UiRole::Table => "table".to_string(),
376            UiRole::TableRow => "table_row".to_string(),
377            UiRole::TableCell => "table_cell".to_string(),
378            UiRole::Toolbar => "toolbar".to_string(),
379            UiRole::Other(s) => format!("other:{}", s),
380        }
381    }
382}
383
384/// Element state flags.
385#[derive(Debug, Clone, Default, Serialize, Deserialize)]
386pub struct UiState {
387    pub enabled: bool,
388    pub focused: bool,
389    pub selected: bool,
390    pub checked: Option<bool>,
391    pub expanded: Option<bool>,
392    pub readonly: bool,
393    pub required: bool,
394}
395
396impl UiState {
397    pub fn enabled() -> Self {
398        Self {
399            enabled: true,
400            ..Default::default()
401        }
402    }
403
404    pub fn disabled() -> Self {
405        Self {
406            enabled: false,
407            ..Default::default()
408        }
409    }
410
411    pub fn from_ax_states(
412        disabled: bool,
413        focused: bool,
414        selected: Option<bool>,
415        checked: Option<bool>,
416        expanded: Option<bool>,
417    ) -> Self {
418        Self {
419            enabled: !disabled,
420            focused,
421            selected: selected.unwrap_or(false),
422            checked,
423            expanded,
424            readonly: false,
425            required: false,
426        }
427    }
428
429    pub fn to_hash_string(&self) -> String {
430        fn opt_bool_str(opt: Option<bool>) -> &'static str {
431            match opt {
432                None => "none",
433                Some(true) => "true",
434                Some(false) => "false",
435            }
436        }
437        format!(
438            "en:{},fo:{},se:{},ch:{},ex:{},ro:{},rq:{}",
439            self.enabled,
440            self.focused,
441            self.selected,
442            opt_bool_str(self.checked),
443            opt_bool_str(self.expanded),
444            self.readonly,
445            self.required
446        )
447    }
448}
449
450#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
451#[serde(rename_all = "snake_case")]
452pub enum ElementSource {
453    AccessibilityTree,
454    VisualDetector,
455    Ocr,
456    Merged { sources: Vec<ElementSource> },
457}
458
459impl ElementSource {
460    pub fn base_confidence(&self) -> f32 {
461        match self {
462            ElementSource::AccessibilityTree => 0.90,
463            ElementSource::VisualDetector => 0.75,
464            ElementSource::Ocr => 0.70,
465            ElementSource::Merged { .. } => 0.98,
466        }
467    }
468}
469
470#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
471#[serde(rename_all = "snake_case")]
472pub enum IconType {
473    Close, Menu, Search, Back, Forward, Refresh, Settings, Share,
474    Download, Upload, Edit, Delete, Add, Remove, Expand, Collapse,
475    Play, Pause, Stop, Mute, Unmute, Fullscreen, ExitFullscreen,
476    Info, Help, Warning, Error, Success, Unknown,
477}
478
479#[derive(Debug, Clone, Serialize, Deserialize)]
480pub struct TextBlock {
481    pub text: String,
482    pub bounds: Bounds,
483    pub source: TextSource,
484    pub confidence: f32,
485}
486
487impl TextBlock {
488    pub fn from_ax(text: String, bounds: Bounds) -> Self {
489        Self {
490            text,
491            bounds,
492            source: TextSource::AccessibilityTree,
493            confidence: 1.0,
494        }
495    }
496}
497
498#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
499#[serde(rename_all = "snake_case")]
500pub enum TextSource {
501    AccessibilityTree,
502    Ocr,
503}
504
505#[derive(Debug, Clone, Default, Serialize, Deserialize)]
506pub struct PageSignals {
507    pub modal_present: bool,
508    pub cookie_banner: bool,
509    pub error_banner: bool,
510    pub loading_indicator: bool,
511    pub scroll_position: f32,
512    pub page_type_hint: Option<String>,
513}
514
515impl PageSignals {
516    pub fn has_blocking_element(&self) -> bool {
517        self.modal_present || self.cookie_banner
518    }
519
520    pub fn needs_special_handling(&self) -> bool {
521        matches!(
522            self.page_type_hint.as_deref(),
523            Some("login") | Some("checkout") | Some("payment")
524        )
525    }
526
527    pub fn to_hash_string(&self) -> String {
528        format!(
529            "mo:{},co:{},er:{},lo:{},sc:{:.2},ty:{}",
530            self.modal_present,
531            self.cookie_banner,
532            self.error_banner,
533            self.loading_indicator,
534            self.scroll_position,
535            self.page_type_hint.as_deref().unwrap_or("none")
536        )
537    }
538}
539
#[cfg(test)]
mod tests {
    use super::*;

    /// Focused, enabled "Submit" button used as the single fixture element.
    fn submit_button() -> UiElement {
        UiElement {
            id: "el_0".to_string(),
            role: UiRole::Button,
            name: Some("Submit".to_string()),
            value: None,
            bounds: Bounds::new(100.0, 100.0, 80.0, 30.0),
            states: UiState {
                focused: true,
                ..UiState::enabled()
            },
            confidence: 0.95,
            source: ElementSource::AccessibilityTree,
            icon_type: None,
            children: vec![],
            ax_ref: None,
        }
    }

    #[test]
    fn test_ui_role_from_ax() {
        let known = [("button", UiRole::Button), ("textfield", UiRole::TextInput)];
        for (raw, expected) in known.iter() {
            assert_eq!(&UiRole::from_ax_role(raw), expected);
        }
        // Unknown names fall through to the catch-all variant.
        assert!(matches!(UiRole::from_ax_role("custom"), UiRole::Other(_)));
    }

    #[test]
    fn test_format_compact() {
        let map = UiMap::new(
            "https://example.com".to_string(),
            vec![submit_button()],
            vec![],
            PageSignals::default(),
            Viewport {
                width: 1280,
                height: 720,
                device_pixel_ratio: 2.0,
            },
            String::new(),
        );

        let compact = map.format_compact();
        for needle in ["[el_0]", "button", "Submit", "focused"].iter() {
            assert!(compact.contains(needle));
        }
    }
}