Skip to main content

oxi_agent/tools/browse/
browse_extract_tool.rs

1//! Browse extract tool — extract structured data from a web page.
2//!
3//! All extraction is done via the already-loaded tab's JavaScript engine —
4//! no engine-level methods that would open additional tabs.
5
6use super::config::BrowseConfig;
7use super::engine::{BrowserEngine, BrowserError, BrowserTab};
8use super::helpers;
9use super::tab_guard::TabGuard;
10use crate::tools::{AgentTool, AgentToolResult, ToolContext, ToolError};
11use async_trait::async_trait;
12use parking_lot::Mutex;
13use serde_json::{json, Value};
14use std::sync::Arc;
15use tokio::sync::oneshot;
16
17/// Extract structured data from a web page using CSS selectors.
18///
19/// Returns links, text content, or element metadata for all elements
20/// matching the given CSS selector.
21pub struct BrowseExtractTool {
22    engine: Arc<dyn BrowserEngine>,
23    config: BrowseConfig,
24    /// Shared callback management (progress + browse progress).
25    callbacks: super::callback_mixin::BrowseCallbacks,
26    /// Shared slot for the current tab's ID.
27    tab_id_slot: Mutex<Arc<parking_lot::Mutex<Option<uuid::Uuid>>>>,
28}
29
30impl BrowseExtractTool {
31    /// Create with the given engine and default config.
32    pub fn new(engine: Arc<dyn BrowserEngine>) -> Self {
33        Self {
34            engine,
35            config: BrowseConfig::default(),
36            callbacks: super::callback_mixin::BrowseCallbacks::new(),
37            tab_id_slot: Mutex::new(Arc::new(parking_lot::Mutex::new(None))),
38        }
39    }
40
41    /// Create with custom configuration.
42    pub fn with_config(engine: Arc<dyn BrowserEngine>, config: BrowseConfig) -> Self {
43        Self {
44            engine,
45            config,
46            callbacks: super::callback_mixin::BrowseCallbacks::new(),
47            tab_id_slot: Mutex::new(Arc::new(parking_lot::Mutex::new(None))),
48        }
49    }
50}
51
52#[async_trait]
53impl AgentTool for BrowseExtractTool {
54    fn name(&self) -> &str {
55        "browse_extract"
56    }
57
58    fn label(&self) -> &str {
59        "Extract Page Data"
60    }
61
62    fn description(&self) -> &str {
63        "Extract structured data from a web page: links, text content, or elements matching \
64         a CSS selector. Use when you need specific data from a page rather than the full content. \
65         Supports extracting all matching elements or just the first match."
66    }
67
68    fn on_progress(&self, callback: crate::tools::ProgressCallback) {
69        self.callbacks.store_progress(callback);
70    }
71
72    fn on_browse_progress(&self, callback: Arc<dyn Fn(super::BrowseProgress) + Send + Sync>) {
73        self.callbacks.store_browse(callback);
74    }
75
76    fn set_tab_id_slot(&self, slot: Arc<parking_lot::Mutex<Option<uuid::Uuid>>>) {
77        *self.tab_id_slot.lock() = slot;
78    }
79
80    fn current_tab_id(&self) -> Option<uuid::Uuid> {
81        *self.tab_id_slot.lock().lock()
82    }
83
84    fn parameters_schema(&self) -> Value {
85        json!({
86            "type": "object",
87            "properties": {
88                "url": {
89                    "type": "string",
90                    "description": "URL of the page to extract from"
91                },
92                "selector": {
93                    "type": "string",
94                    "description": "CSS selector to match elements"
95                },
96                "extract": {
97                    "type": "string",
98                    "enum": ["links", "text", "elements", "markdown"],
99                    "default": "text",
100                    "description": "What to extract: 'links' (href + text), 'text' (textContent), 'elements' (tag + text + attrs), 'markdown' (innerHTML as markdown)"
101                },
102                "all": {
103                    "type": "boolean",
104                    "default": true,
105                    "description": "Return all matches (true) or just the first (false)"
106                },
107                "timeout": {
108                    "type": "integer",
109                    "default": 30,
110                    "description": "Maximum time in seconds"
111                }
112            },
113            "required": ["url", "selector"]
114        })
115    }
116
117    async fn execute(
118        &self,
119        _tool_call_id: &str,
120        params: Value,
121        _signal: Option<oneshot::Receiver<()>>,
122        _ctx: &ToolContext,
123    ) -> Result<AgentToolResult, ToolError> {
124        let url = params["url"]
125            .as_str()
126            .ok_or_else(|| "Missing required parameter: url".to_string())?;
127
128        let selector = params["selector"]
129            .as_str()
130            .ok_or_else(|| "Missing required parameter: selector".to_string())?;
131
132        let extract = params["extract"].as_str().unwrap_or("text");
133        let all = params["all"].as_bool().unwrap_or(true);
134        let timeout_secs = params["timeout"]
135            .as_u64()
136            .unwrap_or(self.config.page_timeout_secs);
137
138        tracing::info!(url = %url, selector = %selector, extract = %extract, "extracting page data");
139
140        // Wrap the entire operation in a timeout
141        let output = tokio::time::timeout(
142            std::time::Duration::from_secs(timeout_secs),
143            self.extract_from_new_tab(url, selector, extract, all),
144        )
145        .await
146        .map_err(|_| format!("Extract timed out after {}s", timeout_secs))??;
147
148        Ok(output)
149    }
150}
151
152impl BrowseExtractTool {
153    /// Open a tab, navigate, extract, close — one tab, one flow.
154    async fn extract_from_new_tab(
155        &self,
156        url: &str,
157        selector: &str,
158        extract: &str,
159        all: bool,
160    ) -> Result<AgentToolResult, ToolError> {
161        let raw_tab = self
162            .engine
163            .new_tab()
164            .await
165            .map_err(|e| format!("Failed to open browser tab: {}", e))?;
166
167        // Store tab_id so the agent loop can include it in
168        // ToolExecutionUpdate events.
169        let tab_id = raw_tab.tab_id();
170        *self.tab_id_slot.lock().lock() = Some(tab_id);
171
172        // Register progress callbacks on the tab via the engine's registry.
173        self.callbacks
174            .register_on_registry(tab_id, self.engine.callback_registry().as_ref());
175
176        let guard = TabGuard::new(raw_tab);
177
178        let page = guard
179            .tab()
180            .goto(url)
181            .await
182            .map_err(|e| format!("Navigation failed: {}", e))?;
183
184        let output = extract_from_tab(guard.tab(), selector, extract, all)
185            .await
186            .map_err(|e: BrowserError| e.to_string())?;
187
188        let metadata_url = page.url.clone();
189        let metadata_title = page.title.clone();
190        let result_count = count_extracted_items(&output, extract);
191
192        guard.close().await;
193        *self.tab_id_slot.lock().lock() = None;
194
195        Ok(AgentToolResult::success(output).with_metadata(json!({
196            "url": metadata_url,
197            "title": metadata_title,
198            "selector": selector,
199            "extract": extract,
200            "result_count": result_count,
201        })))
202    }
203}
204
205// ── Extraction logic ──────────────────────────────────────────────────────────
206
207/// Count items in extraction output for metadata.
208fn count_extracted_items(output: &str, extract: &str) -> usize {
209    match extract {
210        "links" | "elements" => {
211            // JSON array output — count top-level array elements.
212            serde_json::from_str::<Vec<serde_json::Value>>(output)
213                .map(|v| v.len())
214                .unwrap_or(0)
215        }
216        _ => {
217            // Text/markdown — count non-empty lines.
218            output.lines().filter(|l| !l.trim().is_empty()).count()
219        }
220    }
221}
222
223async fn extract_from_tab(
224    tab: &dyn BrowserTab,
225    selector: &str,
226    extract: &str,
227    all: bool,
228) -> Result<String, BrowserError> {
229    match extract {
230        "links" => {
231            let js = helpers::js_links_within(selector);
232            let value = tab.evaluate(&js).await?;
233            let links = helpers::parse_link_values(value);
234            let links = if all {
235                links
236            } else {
237                links.into_iter().take(1).collect()
238            };
239            let json_links: Vec<Value> = links
240                .iter()
241                .map(|(t, h)| json!({ "text": t, "href": h }))
242                .collect();
243            Ok(serde_json::to_string_pretty(&json_links).unwrap_or_default())
244        }
245        "elements" => {
246            let js = helpers::js_query_elements(selector);
247            let value = tab.evaluate(&js).await?;
248            let elements = helpers::parse_element_values(value);
249            let elements = if all {
250                elements
251            } else {
252                elements.into_iter().take(1).collect()
253            };
254            let json_elems: Vec<Value> = elements
255                .iter()
256                .map(|(tag, text, attrs)| json!({ "tag": tag, "text": text, "attributes": attrs }))
257                .collect();
258            Ok(serde_json::to_string_pretty(&json_elems).unwrap_or_default())
259        }
260        "markdown" => {
261            let texts = tab.query_all(selector).await?;
262            let texts = if all {
263                texts
264            } else {
265                texts.into_iter().take(1).collect()
266            };
267            Ok(texts.join("\n\n"))
268        }
269        _ => {
270            // "text" (default)
271            let texts = tab.query_all(selector).await?;
272            let texts = if all {
273                texts
274            } else {
275                texts.into_iter().take(1).collect()
276            };
277            Ok(texts.join("\n"))
278        }
279    }
280}