Skip to main content

spider_browser/ai/
observe.rs

1//! Observe interactive elements on the page.
2//!
3//! Ported from TypeScript `ai/observe.ts`.
4
5use crate::ai::llm_provider::{LLMMessage, LLMProvider};
6use crate::errors::Result;
7use crate::protocol::protocol_adapter::ProtocolAdapter;
8use serde::{Deserialize, Serialize};
9
10/// An observed interactive element on the page.
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct ObserveResult {
13    /// CSS selector that can target this element.
14    pub selector: String,
15    /// HTML tag name.
16    pub tag: String,
17    /// Input type (for input elements).
18    #[serde(rename = "type", default)]
19    pub type_: String,
20    /// Visible text content (truncated).
21    #[serde(default)]
22    pub text: String,
23    /// aria-label attribute.
24    #[serde(rename = "ariaLabel", default)]
25    pub aria_label: String,
26    /// Placeholder text.
27    #[serde(default)]
28    pub placeholder: String,
29    /// href attribute (for links).
30    #[serde(default)]
31    pub href: String,
32    /// Current input value (for inputs/textareas).
33    #[serde(default)]
34    pub value: String,
35    /// Bounding rectangle in viewport coordinates.
36    pub rect: ElementRect,
37    /// Relevance score (0-1) when LLM ranking is used.
38    #[serde(default, skip_serializing_if = "Option::is_none")]
39    pub score: Option<f64>,
40}
41
42/// Bounding rectangle for an element.
43#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct ElementRect {
45    pub x: f64,
46    pub y: f64,
47    pub width: f64,
48    pub height: f64,
49}
50
/// JavaScript snippet that collects the interactive elements on the page.
///
/// The script is an immediately-invoked function expression that evaluates to
/// an array of plain objects with the keys `selector`, `tag`, `type`, `text`,
/// `ariaLabel`, `placeholder`, `href`, `value` and `rect`.
///
/// Behaviour of the script:
/// - Elements are gathered by querying a fixed list of selectors (links,
///   form controls, common ARIA roles, `[onclick]`, `[tabindex]`, `summary`,
///   `details`, `label`); an element matched by several selectors is reported
///   once.
/// - Elements whose bounding rect is 0x0, or that lie entirely above or to
///   the left of the viewport, are skipped.
/// - `text` is trimmed and cut to 100 characters; `value` is cut to 50.
/// - The selector is `#id` when an id exists; otherwise `tag[name="..."]`
///   when a name exists; otherwise the tag plus up to two class names;
///   otherwise the bare tag. Empty class tokens (e.g. `class=" "`) are
///   dropped so the generated selector is never left with a dangling `.`.
/// - Rect components are rounded to integers.
///
/// Ported from TypeScript `utils/dom.ts` GET_INTERACTIVE_ELEMENTS.
pub const GET_INTERACTIVE_ELEMENTS: &str = r#"
(function() {
  var interactiveSelectors = [
    'a[href]',
    'button',
    'input',
    'select',
    'textarea',
    '[role="button"]',
    '[role="link"]',
    '[role="tab"]',
    '[role="menuitem"]',
    '[role="checkbox"]',
    '[role="radio"]',
    '[role="switch"]',
    '[role="combobox"]',
    '[onclick]',
    '[tabindex]',
    'summary',
    'details',
    'label'
  ];
  var seen = new Set();
  var results = [];
  for (var s = 0; s < interactiveSelectors.length; s++) {
    var sel = interactiveSelectors[s];
    var els = document.querySelectorAll(sel);
    for (var i = 0; i < els.length; i++) {
      var el = els[i];
      if (seen.has(el)) continue;
      seen.add(el);
      var r = el.getBoundingClientRect();
      if (r.width === 0 && r.height === 0) continue;
      if (r.bottom < 0 || r.right < 0) continue;

      var tag = el.tagName.toLowerCase();
      var type = el.getAttribute('type') || '';
      var text = (el.textContent || '').trim().slice(0, 100);
      var ariaLabel = el.getAttribute('aria-label') || '';
      var placeholder = el.getAttribute('placeholder') || '';
      var href = el.getAttribute('href') || '';
      var value = (el instanceof HTMLInputElement || el instanceof HTMLTextAreaElement)
        ? el.value.slice(0, 50) : '';

      var cssSelector = tag;
      var id = el.getAttribute('id');
      if (id) {
        cssSelector = '#' + CSS.escape(id);
      } else {
        var cls = el.getAttribute('class');
        if (cls) {
          var classes = cls.trim().split(/\s+/).filter(Boolean).slice(0, 2);
          cssSelector = tag + classes.map(function(c) { return '.' + CSS.escape(c); }).join('');
        }
        var name = el.getAttribute('name');
        if (name) {
          cssSelector = tag + '[name="' + CSS.escape(name) + '"]';
        }
      }

      results.push({
        selector: cssSelector,
        tag: tag,
        type: type,
        text: text,
        ariaLabel: ariaLabel,
        placeholder: placeholder,
        href: href,
        value: value,
        rect: {
          x: Math.round(r.x),
          y: Math.round(r.y),
          width: Math.round(r.width),
          height: Math.round(r.height)
        }
      });
    }
  }
  return results;
})()
"#;
136
137/// Discover interactive elements on the page.
138///
139/// Works WITHOUT an LLM -- injects a DOM traversal script to collect
140/// interactive elements (buttons, links, inputs) with selectors, text,
141/// and bounding rects.
142///
143/// When `instruction` is provided + LLM is available, adds ranking/filtering.
144pub async fn observe(
145    adapter: &ProtocolAdapter,
146    instruction: Option<&str>,
147    llm: Option<&dyn LLMProvider>,
148) -> Result<Vec<ObserveResult>> {
149    // Collect all interactive elements via DOM traversal
150    let raw = adapter.evaluate(GET_INTERACTIVE_ELEMENTS).await?;
151    let elements: Vec<ObserveResult> = serde_json::from_value(raw).unwrap_or_default();
152
153    if elements.is_empty() {
154        return Ok(vec![]);
155    }
156
157    // If no instruction or no LLM, return all elements
158    let (instruction, llm) = match (instruction, llm) {
159        (Some(inst), Some(provider)) => (inst, provider),
160        _ => return Ok(elements),
161    };
162
163    // Use LLM to rank/filter elements by relevance to the instruction
164    let element_summary: String = elements
165        .iter()
166        .enumerate()
167        .map(|(i, el)| {
168            let mut parts = vec![format!("[{i}] <{}>", el.tag)];
169            if !el.text.is_empty() {
170                parts.push(format!("text=\"{}\"", el.text));
171            }
172            if !el.aria_label.is_empty() {
173                parts.push(format!("aria=\"{}\"", el.aria_label));
174            }
175            if !el.placeholder.is_empty() {
176                parts.push(format!("placeholder=\"{}\"", el.placeholder));
177            }
178            if !el.href.is_empty() {
179                parts.push(format!("href=\"{}\"", el.href));
180            }
181            if !el.type_.is_empty() {
182                parts.push(format!("type=\"{}\"", el.type_));
183            }
184            parts.join(" ")
185        })
186        .collect::<Vec<_>>()
187        .join("\n");
188
189    #[derive(Deserialize)]
190    struct RankResponse {
191        #[serde(default)]
192        indices: Vec<usize>,
193    }
194
195    let messages = vec![
196        LLMMessage::system(
197            "You are an element selector. Given a list of page elements and an instruction, \
198             return a JSON object with an \"indices\" array of element indices that match the \
199             instruction. Order by relevance (most relevant first). Return {\"indices\": []} \
200             if none match.",
201        ),
202        LLMMessage::user(format!(
203            "Instruction: {instruction}\n\nElements:\n{element_summary}"
204        )),
205    ];
206
207    let response: RankResponse = crate::ai::llm_provider::chat_json(llm, &messages).await?;
208
209    let valid_indices: Vec<usize> = response
210        .indices
211        .into_iter()
212        .filter(|&i| i < elements.len())
213        .collect();
214
215    let total = valid_indices.len().max(1) as f64;
216    let results = valid_indices
217        .into_iter()
218        .enumerate()
219        .map(|(rank, idx)| {
220            let mut el = elements[idx].clone();
221            el.score = Some(1.0 - rank as f64 / total);
222            el
223        })
224        .collect();
225
226    Ok(results)
227}