Skip to main content

car_browser/perception/
ui_map.rs

1//! UI state representation for perception pipeline.
2//!
3//! UiMap is the core data structure that represents what's visible on screen
4//! in a structured, auditable format.
5
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use sha2::{Digest, Sha256};
9
10use crate::models::{Bounds, Viewport};
11
/// Complete UI state representation.
///
/// One `UiMap` bundles everything captured for a single observation of a
/// page: its elements, text blocks, page-level signals, the viewport, and a
/// content hash derived from them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UiMap {
    /// Identifier of this map; `UiMap::new` fills it with a random UUID v4 string.
    pub id: String,
    /// Creation time; `UiMap::new` stamps the current UTC time.
    pub timestamp: DateTime<Utc>,
    /// URL supplied by the caller when the map was built.
    pub url: String,
    /// UI elements found on the page (interactive and non-interactive roles alike).
    pub elements: Vec<UiElement>,
    /// Free-standing text blocks captured alongside the elements.
    pub text_blocks: Vec<TextBlock>,
    /// Page-level flags such as modal / cookie banner / loading indicator.
    pub page_signals: PageSignals,
    /// Viewport the map was captured for.
    pub viewport: Viewport,
    /// Hex-encoded SHA-256 digest produced by `compute_content_hash`.
    pub content_hash: String,
    /// Caller-supplied screenshot location; may be empty (the unit test passes `String::new()`).
    pub screenshot_path: String,
}
25
26impl UiMap {
27    pub fn new(
28        url: String,
29        elements: Vec<UiElement>,
30        text_blocks: Vec<TextBlock>,
31        page_signals: PageSignals,
32        viewport: Viewport,
33        screenshot_path: String,
34    ) -> Self {
35        let id = uuid::Uuid::new_v4().to_string();
36        let timestamp = Utc::now();
37        let mut map = Self {
38            id,
39            timestamp,
40            url,
41            elements,
42            text_blocks,
43            page_signals,
44            viewport,
45            content_hash: String::new(),
46            screenshot_path,
47        };
48        map.content_hash = map.compute_content_hash();
49        map
50    }
51
52    pub fn compute_content_hash(&self) -> String {
53        let mut hasher = Sha256::new();
54        for element in &self.elements {
55            hasher.update(element.id.as_bytes());
56            hasher.update(element.role.to_hash_string().as_bytes());
57            if let Some(name) = &element.name {
58                hasher.update(name.as_bytes());
59            }
60            hasher.update(element.states.to_hash_string().as_bytes());
61        }
62        for block in &self.text_blocks {
63            hasher.update(block.text.as_bytes());
64        }
65        hasher.update(self.page_signals.to_hash_string().as_bytes());
66        hex::encode(hasher.finalize())
67    }
68
69    pub fn get_element(&self, element_id: &str) -> Option<&UiElement> {
70        self.elements.iter().find(|e| e.id == element_id)
71    }
72
73    pub fn get_elements_by_role(&self, role: UiRole) -> Vec<&UiElement> {
74        self.elements
75            .iter()
76            .filter(|e| std::mem::discriminant(&e.role) == std::mem::discriminant(&role))
77            .collect()
78    }
79
80    pub fn interactive_elements(&self) -> Vec<&UiElement> {
81        self.elements
82            .iter()
83            .filter(|e| e.role.is_interactable() && e.is_interactable())
84            .collect()
85    }
86
87    pub fn estimate_tokens(&self, interactive_only: bool) -> usize {
88        let count = if interactive_only {
89            self.interactive_elements().len()
90        } else {
91            self.elements.len()
92        };
93        count * 20 + 30
94    }
95
96    pub fn average_confidence(&self) -> f32 {
97        if self.elements.is_empty() {
98            return 0.0;
99        }
100        let sum: f32 = self.elements.iter().map(|e| e.confidence).sum();
101        sum / self.elements.len() as f32
102    }
103
104    /// Format as a structured page summary for LLM consumption.
105    /// Includes: page signals, visible text content, and interactive elements.
106    /// Targets ~4KB output (vs 86KB for raw accessibility tree).
107    pub fn format_summary(&self) -> String {
108        use std::fmt::Write;
109        let mut output = String::new();
110
111        // Page signals
112        if self.page_signals.has_blocking_element() {
113            if self.page_signals.modal_present {
114                let _ = writeln!(output, "⚠ Modal dialog present");
115            }
116            if self.page_signals.cookie_banner {
117                let _ = writeln!(output, "⚠ Cookie banner present");
118            }
119        }
120        if self.page_signals.loading_indicator {
121            let _ = writeln!(output, "⏳ Page loading...");
122        }
123
124        // Visible text content — what a user would actually see
125        let _ = writeln!(output, "\n## Visible Text");
126        let mut seen_texts: std::collections::HashSet<String> = std::collections::HashSet::new();
127        for el in &self.elements {
128            if let Some(ref name) = el.name {
129                let text = name.trim().to_string();
130                if !text.is_empty() && text.len() > 1 && seen_texts.insert(text.clone()) {
131                    let role = el.role.to_hash_string();
132                    let truncated = if text.len() > 80 {
133                        let end = text.floor_char_boundary(77);
134                        format!("{}...", &text[..end])
135                    } else {
136                        text
137                    };
138                    let _ = writeln!(output, "  ({}) {}", role, truncated);
139                }
140            }
141        }
142
143        // Interactive elements — what the agent can click/type
144        let _ = writeln!(output, "\n## Interactive Elements");
145        let interactive = self.interactive_elements();
146        for (i, el) in interactive.iter().enumerate().take(50) {
147            let role_str = el.role.to_hash_string();
148            let name_str = el
149                .name
150                .as_deref()
151                .map(|n| {
152                    if n.len() > 40 {
153                        let end = n.floor_char_boundary(37);
154                        format!(" \"{}...\"", &n[..end])
155                    } else {
156                        format!(" \"{}\"", n)
157                    }
158                })
159                .unwrap_or_default();
160            let mut state_parts = Vec::new();
161            if element_states_for_summary(&el.states, &mut state_parts) {
162                let _ = writeln!(
163                    output,
164                    "[{}] {}{} {}",
165                    el.id,
166                    role_str,
167                    name_str,
168                    state_parts.join(" ")
169                );
170            } else {
171                let _ = writeln!(output, "[{}] {}{}", el.id, role_str, name_str);
172            }
173            let _ = i; // suppress unused
174        }
175        if interactive.len() > 50 {
176            let _ = writeln!(
177                output,
178                "  ... and {} more interactive elements",
179                interactive.len() - 50
180            );
181        }
182
183        output
184    }
185
186    /// Format as compact text for LLM consumption.
187    ///
188    /// Format: `[el_0] Button "Submit" (120,340) focused`
189    pub fn format_compact(&self) -> String {
190        use std::fmt::Write;
191        let mut output = String::new();
192
193        // Page signals header
194        if self.page_signals.has_blocking_element() {
195            if self.page_signals.modal_present {
196                let _ = writeln!(output, "⚠ Modal dialog present");
197            }
198            if self.page_signals.cookie_banner {
199                let _ = writeln!(output, "⚠ Cookie banner present");
200            }
201        }
202        if self.page_signals.loading_indicator {
203            let _ = writeln!(output, "⏳ Page loading...");
204        }
205
206        // Use interactive-only if >40 elements (compact mode)
207        let elements: Vec<&UiElement> = if self.elements.len() > 40 {
208            self.interactive_elements()
209        } else {
210            self.elements.iter().collect()
211        };
212
213        for element in &elements {
214            let role_str = element.role.to_hash_string();
215            let name_str = element
216                .name
217                .as_deref()
218                .map(|n| {
219                    if n.len() > 50 {
220                        let truncated: String = n.chars().take(47).collect();
221                        format!(" \"{}...\"", truncated)
222                    } else {
223                        format!(" \"{}\"", n)
224                    }
225                })
226                .unwrap_or_default();
227
228            let (cx, cy) = element.bounds.center();
229            let pos_str = format!(" ({:.0},{:.0})", cx, cy);
230
231            let mut state_parts = Vec::new();
232            if element.states.focused {
233                state_parts.push("focused");
234            }
235            if !element.states.enabled {
236                state_parts.push("disabled");
237            }
238            if element.states.checked == Some(true) {
239                state_parts.push("checked");
240            }
241            if element.states.expanded == Some(true) {
242                state_parts.push("expanded");
243            }
244            let state_str = if state_parts.is_empty() {
245                String::new()
246            } else {
247                format!(" {}", state_parts.join(" "))
248            };
249
250            let _ = writeln!(
251                output,
252                "[{}] {}{}{}{}",
253                element.id, role_str, name_str, pos_str, state_str
254            );
255        }
256
257        output
258    }
259}
260
261fn element_states_for_summary(states: &UiState, parts: &mut Vec<&'static str>) -> bool {
262    if states.focused {
263        parts.push("focused");
264    }
265    if !states.enabled {
266        parts.push("disabled");
267    }
268    if states.checked == Some(true) {
269        parts.push("checked");
270    }
271    if states.expanded == Some(true) {
272        parts.push("expanded");
273    }
274    !parts.is_empty()
275}
276
/// A single UI element captured in a `UiMap`.
///
/// Elements of every `UiRole` are stored here; whether one can be acted on
/// is decided by `UiRole::is_interactable` together with
/// `UiElement::is_interactable`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UiElement {
    /// Identifier used to look the element up (`UiMap::get_element`) and to
    /// label it in formatted output, e.g. `el_0`.
    pub id: String,
    /// Simplified role of the element.
    pub role: UiRole,
    /// Label of the element, if any; shown in the formatted output.
    pub name: Option<String>,
    /// Current value (for form inputs — text content, selected option, etc.)
    pub value: Option<String>,
    /// On-screen rectangle; a zero width or height makes the element non-interactable.
    pub bounds: Bounds,
    /// State flags (enabled, focused, checked, ...).
    pub states: UiState,
    /// Detection confidence; averaged by `UiMap::average_confidence`.
    pub confidence: f32,
    /// Which perception source produced this element.
    pub source: ElementSource,
    /// Recognised icon category, if any.
    pub icon_type: Option<IconType>,
    /// Child references — presumably the `id`s of child elements; TODO confirm.
    pub children: Vec<String>,
    /// Original AX node ID for execution mapping.
    pub ax_ref: Option<String>,
}
294
295impl UiElement {
296    pub fn is_interactable(&self) -> bool {
297        self.states.enabled && self.bounds.width > 0.0 && self.bounds.height > 0.0
298    }
299
300    pub fn center(&self) -> (f64, f64) {
301        self.bounds.center()
302    }
303
304    pub fn accepts_text(&self) -> bool {
305        matches!(self.role, UiRole::TextInput)
306    }
307
308    pub fn is_clickable(&self) -> bool {
309        matches!(self.role, UiRole::Button | UiRole::Link)
310    }
311}
312
/// Element roles (simplified from full ARIA).
///
/// Serialised in `snake_case`. Accessibility role names are mapped onto
/// these variants by `UiRole::from_ax_role`; anything unrecognised is kept
/// verbatim (lower-cased) in `Other`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum UiRole {
    Button,
    Link,
    TextInput,
    Checkbox,
    Radio,
    Dropdown,
    Menu,
    MenuItem,
    Tab,
    Dialog,
    Image,
    Text,
    Container,
    List,
    ListItem,
    Table,
    TableRow,
    TableCell,
    Toolbar,
    /// Any role without a dedicated variant; holds the lower-cased role name.
    Other(String),
}
338
339impl UiRole {
340    pub fn from_ax_role(ax_role: &str) -> Self {
341        match ax_role.to_lowercase().as_str() {
342            "button" | "pushbutton" => UiRole::Button,
343            "link" | "weblink" => UiRole::Link,
344            "textbox" | "textfield" | "textarea" | "combobox" | "searchfield" => UiRole::TextInput,
345            "checkbox" => UiRole::Checkbox,
346            "radio" | "radiobutton" => UiRole::Radio,
347            "select" | "listbox" | "popupbutton" => UiRole::Dropdown,
348            "menu" | "menubar" => UiRole::Menu,
349            "menuitem" | "menuitemcheckbox" | "menuitemradio" => UiRole::MenuItem,
350            "tab" | "tabitem" => UiRole::Tab,
351            "dialog" | "alertdialog" | "sheet" => UiRole::Dialog,
352            "image" | "img" => UiRole::Image,
353            "statictext" | "label" | "heading" => UiRole::Text,
354            "group" | "generic" | "section" | "div" | "webarea" => UiRole::Container,
355            "list" => UiRole::List,
356            "listitem" => UiRole::ListItem,
357            "table" | "grid" => UiRole::Table,
358            "row" | "tablerow" => UiRole::TableRow,
359            "cell" | "tablecell" | "gridcell" => UiRole::TableCell,
360            "toolbar" => UiRole::Toolbar,
361            other => UiRole::Other(other.to_string()),
362        }
363    }
364
365    pub fn is_interactable(&self) -> bool {
366        matches!(
367            self,
368            UiRole::Button
369                | UiRole::Link
370                | UiRole::TextInput
371                | UiRole::Checkbox
372                | UiRole::Radio
373                | UiRole::Dropdown
374                | UiRole::MenuItem
375                | UiRole::Tab
376        )
377    }
378
379    pub fn to_hash_string(&self) -> String {
380        match self {
381            UiRole::Button => "button".to_string(),
382            UiRole::Link => "link".to_string(),
383            UiRole::TextInput => "text_input".to_string(),
384            UiRole::Checkbox => "checkbox".to_string(),
385            UiRole::Radio => "radio".to_string(),
386            UiRole::Dropdown => "dropdown".to_string(),
387            UiRole::Menu => "menu".to_string(),
388            UiRole::MenuItem => "menu_item".to_string(),
389            UiRole::Tab => "tab".to_string(),
390            UiRole::Dialog => "dialog".to_string(),
391            UiRole::Image => "image".to_string(),
392            UiRole::Text => "text".to_string(),
393            UiRole::Container => "container".to_string(),
394            UiRole::List => "list".to_string(),
395            UiRole::ListItem => "list_item".to_string(),
396            UiRole::Table => "table".to_string(),
397            UiRole::TableRow => "table_row".to_string(),
398            UiRole::TableCell => "table_cell".to_string(),
399            UiRole::Toolbar => "toolbar".to_string(),
400            UiRole::Other(s) => format!("other:{}", s),
401        }
402    }
403}
404
/// Element state flags.
///
/// The derived `Default` sets every `bool` to `false` — including `enabled` —
/// and every `Option` to `None`; use `UiState::enabled()` for an enabled
/// element with all other flags cleared.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct UiState {
    /// Whether the element is enabled; `false` is rendered as `disabled`.
    pub enabled: bool,
    /// Whether the element currently has focus.
    pub focused: bool,
    /// Whether the element is selected.
    pub selected: bool,
    /// Checked state; `None` when not reported for this element.
    pub checked: Option<bool>,
    /// Expanded state; `None` when not reported for this element.
    pub expanded: Option<bool>,
    /// Read-only flag (`from_ax_states` always leaves it `false`).
    pub readonly: bool,
    /// Required flag (`from_ax_states` always leaves it `false`).
    pub required: bool,
}
416
417impl UiState {
418    pub fn enabled() -> Self {
419        Self {
420            enabled: true,
421            ..Default::default()
422        }
423    }
424
425    pub fn disabled() -> Self {
426        Self {
427            enabled: false,
428            ..Default::default()
429        }
430    }
431
432    pub fn from_ax_states(
433        disabled: bool,
434        focused: bool,
435        selected: Option<bool>,
436        checked: Option<bool>,
437        expanded: Option<bool>,
438    ) -> Self {
439        Self {
440            enabled: !disabled,
441            focused,
442            selected: selected.unwrap_or(false),
443            checked,
444            expanded,
445            readonly: false,
446            required: false,
447        }
448    }
449
450    pub fn to_hash_string(&self) -> String {
451        fn opt_bool_str(opt: Option<bool>) -> &'static str {
452            match opt {
453                None => "none",
454                Some(true) => "true",
455                Some(false) => "false",
456            }
457        }
458        format!(
459            "en:{},fo:{},se:{},ch:{},ex:{},ro:{},rq:{}",
460            self.enabled,
461            self.focused,
462            self.selected,
463            opt_bool_str(self.checked),
464            opt_bool_str(self.expanded),
465            self.readonly,
466            self.required
467        )
468    }
469}
470
/// Perception source that produced a `UiElement`.
///
/// Serialised in `snake_case`. Each source has a baseline confidence, see
/// `ElementSource::base_confidence`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum ElementSource {
    /// Taken from the accessibility tree.
    AccessibilityTree,
    /// Found by the visual detector.
    VisualDetector,
    /// Found through OCR.
    Ocr,
    /// Combination of several sources, listed in `sources`.
    Merged { sources: Vec<ElementSource> },
}
479
480impl ElementSource {
481    pub fn base_confidence(&self) -> f32 {
482        match self {
483            ElementSource::AccessibilityTree => 0.90,
484            ElementSource::VisualDetector => 0.75,
485            ElementSource::Ocr => 0.70,
486            ElementSource::Merged { .. } => 0.98,
487        }
488    }
489}
490
/// Icon categories that can be attached to a `UiElement` through its
/// `icon_type` field.
///
/// Serialised in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum IconType {
    Close,
    Menu,
    Search,
    Back,
    Forward,
    Refresh,
    Settings,
    Share,
    Download,
    Upload,
    Edit,
    Delete,
    Add,
    Remove,
    Expand,
    Collapse,
    Play,
    Pause,
    Stop,
    Mute,
    Unmute,
    Fullscreen,
    ExitFullscreen,
    Info,
    Help,
    Warning,
    Error,
    Success,
    Unknown,
}
524
/// A piece of text captured from the page together with where it was found.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextBlock {
    /// The text itself; this is the only field that feeds `UiMap::compute_content_hash`.
    pub text: String,
    /// Rectangle associated with the text.
    pub bounds: Bounds,
    /// Where the text came from (accessibility tree or OCR).
    pub source: TextSource,
    /// Confidence in the text; `TextBlock::from_ax` uses `1.0`.
    pub confidence: f32,
}
532
533impl TextBlock {
534    pub fn from_ax(text: String, bounds: Bounds) -> Self {
535        Self {
536            text,
537            bounds,
538            source: TextSource::AccessibilityTree,
539            confidence: 1.0,
540        }
541    }
542}
543
/// Origin of a `TextBlock`'s text. Serialised in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TextSource {
    /// Text read from the accessibility tree.
    AccessibilityTree,
    /// Text recognised through OCR.
    Ocr,
}
550
/// Page-level flags that describe the overall state of the page.
///
/// The derived `Default` clears every flag, sets `scroll_position` to `0.0`
/// and leaves `page_type_hint` empty.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PageSignals {
    /// A modal dialog is present; counts as a blocking element.
    pub modal_present: bool,
    /// A cookie banner is present; counts as a blocking element.
    pub cookie_banner: bool,
    /// An error banner is present (hashed, but not shown by the text formatters).
    pub error_banner: bool,
    /// A loading indicator is visible.
    pub loading_indicator: bool,
    /// Scroll position; hashed with two decimals. Unit/range not visible here — TODO confirm.
    pub scroll_position: f32,
    /// Optional page classification; `login`, `checkout` and `payment` trigger special handling.
    pub page_type_hint: Option<String>,
}
560
561impl PageSignals {
562    pub fn has_blocking_element(&self) -> bool {
563        self.modal_present || self.cookie_banner
564    }
565
566    pub fn needs_special_handling(&self) -> bool {
567        matches!(
568            self.page_type_hint.as_deref(),
569            Some("login") | Some("checkout") | Some("payment")
570        )
571    }
572
573    pub fn to_hash_string(&self) -> String {
574        format!(
575            "mo:{},co:{},er:{},lo:{},sc:{:.2},ty:{}",
576            self.modal_present,
577            self.cookie_banner,
578            self.error_banner,
579            self.loading_indicator,
580            self.scroll_position,
581            self.page_type_hint.as_deref().unwrap_or("none")
582        )
583    }
584}
585
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: a focused, enabled "Submit" button at (100, 100), 80x30.
    fn submit_button() -> UiElement {
        UiElement {
            id: String::from("el_0"),
            role: UiRole::Button,
            name: Some(String::from("Submit")),
            value: None,
            bounds: Bounds::new(100.0, 100.0, 80.0, 30.0),
            states: UiState {
                focused: true,
                ..UiState::enabled()
            },
            confidence: 0.95,
            source: ElementSource::AccessibilityTree,
            icon_type: None,
            children: Vec::new(),
            ax_ref: None,
        }
    }

    /// Known role names map to their variants; unknown ones fall into `Other`.
    #[test]
    fn test_ui_role_from_ax() {
        let known = [
            ("button", UiRole::Button),
            ("textfield", UiRole::TextInput),
        ];
        for (raw, expected) in known {
            assert_eq!(UiRole::from_ax_role(raw), expected);
        }
        assert!(matches!(UiRole::from_ax_role("custom"), UiRole::Other(_)));
    }

    /// The compact rendering mentions the id, role, name and focus state.
    #[test]
    fn test_format_compact() {
        let map = UiMap::new(
            String::from("https://example.com"),
            vec![submit_button()],
            Vec::new(),
            PageSignals::default(),
            Viewport {
                width: 1280,
                height: 720,
                device_pixel_ratio: 2.0,
            },
            String::new(),
        );

        let compact = map.format_compact();
        for needle in ["[el_0]", "button", "Submit", "focused"] {
            assert!(compact.contains(needle));
        }
    }
}
633}