Skip to main content

car_browser/perception/
ui_map.rs

//! UI state representation for the perception pipeline.
//!
//! `UiMap` is the core data structure that represents what is visible on screen
//! in a structured, auditable format.
5
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use sha2::{Digest, Sha256};
9
10use crate::models::{Bounds, Viewport};
11
12/// Complete UI state representation.
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct UiMap {
15    pub id: String,
16    pub timestamp: DateTime<Utc>,
17    pub url: String,
18    pub elements: Vec<UiElement>,
19    pub text_blocks: Vec<TextBlock>,
20    pub page_signals: PageSignals,
21    pub viewport: Viewport,
22    pub content_hash: String,
23    pub screenshot_path: String,
24}
25
26impl UiMap {
27    pub fn new(
28        url: String,
29        elements: Vec<UiElement>,
30        text_blocks: Vec<TextBlock>,
31        page_signals: PageSignals,
32        viewport: Viewport,
33        screenshot_path: String,
34    ) -> Self {
35        let id = uuid::Uuid::new_v4().to_string();
36        let timestamp = Utc::now();
37        let mut map = Self {
38            id,
39            timestamp,
40            url,
41            elements,
42            text_blocks,
43            page_signals,
44            viewport,
45            content_hash: String::new(),
46            screenshot_path,
47        };
48        map.content_hash = map.compute_content_hash();
49        map
50    }
51
52    pub fn compute_content_hash(&self) -> String {
53        let mut hasher = Sha256::new();
54        for element in &self.elements {
55            hasher.update(element.id.as_bytes());
56            hasher.update(element.role.to_hash_string().as_bytes());
57            if let Some(name) = &element.name {
58                hasher.update(name.as_bytes());
59            }
60            hasher.update(element.states.to_hash_string().as_bytes());
61        }
62        for block in &self.text_blocks {
63            hasher.update(block.text.as_bytes());
64        }
65        hasher.update(self.page_signals.to_hash_string().as_bytes());
66        hex::encode(hasher.finalize())
67    }
68
69    pub fn get_element(&self, element_id: &str) -> Option<&UiElement> {
70        self.elements.iter().find(|e| e.id == element_id)
71    }
72
73    pub fn get_elements_by_role(&self, role: UiRole) -> Vec<&UiElement> {
74        self.elements
75            .iter()
76            .filter(|e| std::mem::discriminant(&e.role) == std::mem::discriminant(&role))
77            .collect()
78    }
79
80    pub fn interactive_elements(&self) -> Vec<&UiElement> {
81        self.elements
82            .iter()
83            .filter(|e| e.role.is_interactable() && e.is_interactable())
84            .collect()
85    }
86
87    pub fn estimate_tokens(&self, interactive_only: bool) -> usize {
88        let count = if interactive_only {
89            self.interactive_elements().len()
90        } else {
91            self.elements.len()
92        };
93        count * 20 + 30
94    }
95
96    pub fn average_confidence(&self) -> f32 {
97        if self.elements.is_empty() {
98            return 0.0;
99        }
100        let sum: f32 = self.elements.iter().map(|e| e.confidence).sum();
101        sum / self.elements.len() as f32
102    }
103
104    /// Format as compact text for LLM consumption.
105    ///
106    /// Format: `[el_0] Button "Submit" (120,340) focused`
107    pub fn format_compact(&self) -> String {
108        use std::fmt::Write;
109        let mut output = String::new();
110
111        // Page signals header
112        if self.page_signals.has_blocking_element() {
113            if self.page_signals.modal_present {
114                let _ = writeln!(output, "⚠ Modal dialog present");
115            }
116            if self.page_signals.cookie_banner {
117                let _ = writeln!(output, "⚠ Cookie banner present");
118            }
119        }
120        if self.page_signals.loading_indicator {
121            let _ = writeln!(output, "⏳ Page loading...");
122        }
123
124        // Use interactive-only if >40 elements (compact mode)
125        let elements: Vec<&UiElement> = if self.elements.len() > 40 {
126            self.interactive_elements()
127        } else {
128            self.elements.iter().collect()
129        };
130
131        for element in &elements {
132            let role_str = element.role.to_hash_string();
133            let name_str = element
134                .name
135                .as_deref()
136                .map(|n| {
137                    if n.len() > 50 {
138                        let truncated: String = n.chars().take(47).collect();
139                        format!(" \"{}...\"", truncated)
140                    } else {
141                        format!(" \"{}\"", n)
142                    }
143                })
144                .unwrap_or_default();
145
146            let (cx, cy) = element.bounds.center();
147            let pos_str = format!(" ({:.0},{:.0})", cx, cy);
148
149            let mut state_parts = Vec::new();
150            if element.states.focused {
151                state_parts.push("focused");
152            }
153            if !element.states.enabled {
154                state_parts.push("disabled");
155            }
156            if element.states.checked == Some(true) {
157                state_parts.push("checked");
158            }
159            if element.states.expanded == Some(true) {
160                state_parts.push("expanded");
161            }
162            let state_str = if state_parts.is_empty() {
163                String::new()
164            } else {
165                format!(" {}", state_parts.join(" "))
166            };
167
168            let _ = writeln!(
169                output,
170                "[{}] {}{}{}{}",
171                element.id, role_str, name_str, pos_str, state_str
172            );
173        }
174
175        output
176    }
177}
178
179/// A single interactable UI element.
180#[derive(Debug, Clone, Serialize, Deserialize)]
181pub struct UiElement {
182    pub id: String,
183    pub role: UiRole,
184    pub name: Option<String>,
185    /// Current value (for form inputs — text content, selected option, etc.)
186    pub value: Option<String>,
187    pub bounds: Bounds,
188    pub states: UiState,
189    pub confidence: f32,
190    pub source: ElementSource,
191    pub icon_type: Option<IconType>,
192    pub children: Vec<String>,
193    /// Original AX node ID for execution mapping.
194    pub ax_ref: Option<String>,
195}
196
197impl UiElement {
198    pub fn is_interactable(&self) -> bool {
199        self.states.enabled && self.bounds.width > 0.0 && self.bounds.height > 0.0
200    }
201
202    pub fn center(&self) -> (f64, f64) {
203        self.bounds.center()
204    }
205
206    pub fn accepts_text(&self) -> bool {
207        matches!(self.role, UiRole::TextInput)
208    }
209
210    pub fn is_clickable(&self) -> bool {
211        matches!(self.role, UiRole::Button | UiRole::Link)
212    }
213}
214
215/// Element roles (simplified from full ARIA).
216#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
217#[serde(rename_all = "snake_case")]
218pub enum UiRole {
219    Button,
220    Link,
221    TextInput,
222    Checkbox,
223    Radio,
224    Dropdown,
225    Menu,
226    MenuItem,
227    Tab,
228    Dialog,
229    Image,
230    Text,
231    Container,
232    List,
233    ListItem,
234    Table,
235    TableRow,
236    TableCell,
237    Toolbar,
238    Other(String),
239}
240
241impl UiRole {
242    pub fn from_ax_role(ax_role: &str) -> Self {
243        match ax_role.to_lowercase().as_str() {
244            "button" | "pushbutton" => UiRole::Button,
245            "link" | "weblink" => UiRole::Link,
246            "textbox" | "textfield" | "textarea" | "combobox" | "searchfield" => UiRole::TextInput,
247            "checkbox" => UiRole::Checkbox,
248            "radio" | "radiobutton" => UiRole::Radio,
249            "select" | "listbox" | "popupbutton" => UiRole::Dropdown,
250            "menu" | "menubar" => UiRole::Menu,
251            "menuitem" | "menuitemcheckbox" | "menuitemradio" => UiRole::MenuItem,
252            "tab" | "tabitem" => UiRole::Tab,
253            "dialog" | "alertdialog" | "sheet" => UiRole::Dialog,
254            "image" | "img" => UiRole::Image,
255            "statictext" | "label" | "heading" => UiRole::Text,
256            "group" | "generic" | "section" | "div" | "webarea" => UiRole::Container,
257            "list" => UiRole::List,
258            "listitem" => UiRole::ListItem,
259            "table" | "grid" => UiRole::Table,
260            "row" | "tablerow" => UiRole::TableRow,
261            "cell" | "tablecell" | "gridcell" => UiRole::TableCell,
262            "toolbar" => UiRole::Toolbar,
263            other => UiRole::Other(other.to_string()),
264        }
265    }
266
267    pub fn is_interactable(&self) -> bool {
268        matches!(
269            self,
270            UiRole::Button
271                | UiRole::Link
272                | UiRole::TextInput
273                | UiRole::Checkbox
274                | UiRole::Radio
275                | UiRole::Dropdown
276                | UiRole::MenuItem
277                | UiRole::Tab
278        )
279    }
280
281    pub fn to_hash_string(&self) -> String {
282        match self {
283            UiRole::Button => "button".to_string(),
284            UiRole::Link => "link".to_string(),
285            UiRole::TextInput => "text_input".to_string(),
286            UiRole::Checkbox => "checkbox".to_string(),
287            UiRole::Radio => "radio".to_string(),
288            UiRole::Dropdown => "dropdown".to_string(),
289            UiRole::Menu => "menu".to_string(),
290            UiRole::MenuItem => "menu_item".to_string(),
291            UiRole::Tab => "tab".to_string(),
292            UiRole::Dialog => "dialog".to_string(),
293            UiRole::Image => "image".to_string(),
294            UiRole::Text => "text".to_string(),
295            UiRole::Container => "container".to_string(),
296            UiRole::List => "list".to_string(),
297            UiRole::ListItem => "list_item".to_string(),
298            UiRole::Table => "table".to_string(),
299            UiRole::TableRow => "table_row".to_string(),
300            UiRole::TableCell => "table_cell".to_string(),
301            UiRole::Toolbar => "toolbar".to_string(),
302            UiRole::Other(s) => format!("other:{}", s),
303        }
304    }
305}
306
307/// Element state flags.
308#[derive(Debug, Clone, Default, Serialize, Deserialize)]
309pub struct UiState {
310    pub enabled: bool,
311    pub focused: bool,
312    pub selected: bool,
313    pub checked: Option<bool>,
314    pub expanded: Option<bool>,
315    pub readonly: bool,
316    pub required: bool,
317}
318
319impl UiState {
320    pub fn enabled() -> Self {
321        Self {
322            enabled: true,
323            ..Default::default()
324        }
325    }
326
327    pub fn disabled() -> Self {
328        Self {
329            enabled: false,
330            ..Default::default()
331        }
332    }
333
334    pub fn from_ax_states(
335        disabled: bool,
336        focused: bool,
337        selected: Option<bool>,
338        checked: Option<bool>,
339        expanded: Option<bool>,
340    ) -> Self {
341        Self {
342            enabled: !disabled,
343            focused,
344            selected: selected.unwrap_or(false),
345            checked,
346            expanded,
347            readonly: false,
348            required: false,
349        }
350    }
351
352    pub fn to_hash_string(&self) -> String {
353        fn opt_bool_str(opt: Option<bool>) -> &'static str {
354            match opt {
355                None => "none",
356                Some(true) => "true",
357                Some(false) => "false",
358            }
359        }
360        format!(
361            "en:{},fo:{},se:{},ch:{},ex:{},ro:{},rq:{}",
362            self.enabled,
363            self.focused,
364            self.selected,
365            opt_bool_str(self.checked),
366            opt_bool_str(self.expanded),
367            self.readonly,
368            self.required
369        )
370    }
371}
372
373#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
374#[serde(rename_all = "snake_case")]
375pub enum ElementSource {
376    AccessibilityTree,
377    VisualDetector,
378    Ocr,
379    Merged { sources: Vec<ElementSource> },
380}
381
382impl ElementSource {
383    pub fn base_confidence(&self) -> f32 {
384        match self {
385            ElementSource::AccessibilityTree => 0.90,
386            ElementSource::VisualDetector => 0.75,
387            ElementSource::Ocr => 0.70,
388            ElementSource::Merged { .. } => 0.98,
389        }
390    }
391}
392
393#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
394#[serde(rename_all = "snake_case")]
395pub enum IconType {
396    Close, Menu, Search, Back, Forward, Refresh, Settings, Share,
397    Download, Upload, Edit, Delete, Add, Remove, Expand, Collapse,
398    Play, Pause, Stop, Mute, Unmute, Fullscreen, ExitFullscreen,
399    Info, Help, Warning, Error, Success, Unknown,
400}
401
402#[derive(Debug, Clone, Serialize, Deserialize)]
403pub struct TextBlock {
404    pub text: String,
405    pub bounds: Bounds,
406    pub source: TextSource,
407    pub confidence: f32,
408}
409
410impl TextBlock {
411    pub fn from_ax(text: String, bounds: Bounds) -> Self {
412        Self {
413            text,
414            bounds,
415            source: TextSource::AccessibilityTree,
416            confidence: 1.0,
417        }
418    }
419}
420
421#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
422#[serde(rename_all = "snake_case")]
423pub enum TextSource {
424    AccessibilityTree,
425    Ocr,
426}
427
428#[derive(Debug, Clone, Default, Serialize, Deserialize)]
429pub struct PageSignals {
430    pub modal_present: bool,
431    pub cookie_banner: bool,
432    pub error_banner: bool,
433    pub loading_indicator: bool,
434    pub scroll_position: f32,
435    pub page_type_hint: Option<String>,
436}
437
438impl PageSignals {
439    pub fn has_blocking_element(&self) -> bool {
440        self.modal_present || self.cookie_banner
441    }
442
443    pub fn needs_special_handling(&self) -> bool {
444        matches!(
445            self.page_type_hint.as_deref(),
446            Some("login") | Some("checkout") | Some("payment")
447        )
448    }
449
450    pub fn to_hash_string(&self) -> String {
451        format!(
452            "mo:{},co:{},er:{},lo:{},sc:{:.2},ty:{}",
453            self.modal_present,
454            self.cookie_banner,
455            self.error_banner,
456            self.loading_indicator,
457            self.scroll_position,
458            self.page_type_hint.as_deref().unwrap_or("none")
459        )
460    }
461}
462
#[cfg(test)]
mod tests {
    use super::*;

    /// Focused, enabled "Submit" button used as the single element of the test map.
    fn submit_button() -> UiElement {
        UiElement {
            id: "el_0".to_string(),
            role: UiRole::Button,
            name: Some("Submit".to_string()),
            value: None,
            bounds: Bounds::new(100.0, 100.0, 80.0, 30.0),
            states: UiState {
                focused: true,
                ..UiState::enabled()
            },
            confidence: 0.95,
            source: ElementSource::AccessibilityTree,
            icon_type: None,
            children: vec![],
            ax_ref: None,
        }
    }

    /// Known AX role names map to their variants; unknown ones fall back to `Other`.
    #[test]
    fn test_ui_role_from_ax() {
        assert_eq!(UiRole::from_ax_role("button"), UiRole::Button);
        assert_eq!(UiRole::from_ax_role("textfield"), UiRole::TextInput);
        assert!(matches!(UiRole::from_ax_role("custom"), UiRole::Other(_)));
    }

    /// The compact listing mentions the element's id, role, name and focus state.
    #[test]
    fn test_format_compact() {
        let viewport = Viewport {
            width: 1280,
            height: 720,
            device_pixel_ratio: 2.0,
        };
        let map = UiMap::new(
            "https://example.com".to_string(),
            vec![submit_button()],
            vec![],
            PageSignals::default(),
            viewport,
            String::new(),
        );

        let compact = map.format_compact();
        for needle in ["[el_0]", "button", "Submit", "focused"].iter() {
            assert!(compact.contains(*needle));
        }
    }
}