1use super::config::BrowseConfig;
7use super::engine::{BrowserEngine, BrowserError, BrowserTab};
8use super::helpers;
9use super::tab_guard::TabGuard;
10use crate::tools::{AgentTool, AgentToolResult, ToolContext, ToolError};
11use async_trait::async_trait;
12use parking_lot::Mutex;
13use serde_json::{json, Value};
14use std::sync::Arc;
15use tokio::sync::oneshot;
16
17pub struct BrowseExtractTool {
22 engine: Arc<dyn BrowserEngine>,
23 config: BrowseConfig,
24 callbacks: super::callback_mixin::BrowseCallbacks,
26 tab_id_slot: Mutex<Arc<parking_lot::Mutex<Option<uuid::Uuid>>>>,
28}
29
30impl BrowseExtractTool {
31 pub fn new(engine: Arc<dyn BrowserEngine>) -> Self {
33 Self {
34 engine,
35 config: BrowseConfig::default(),
36 callbacks: super::callback_mixin::BrowseCallbacks::new(),
37 tab_id_slot: Mutex::new(Arc::new(parking_lot::Mutex::new(None))),
38 }
39 }
40
41 pub fn with_config(engine: Arc<dyn BrowserEngine>, config: BrowseConfig) -> Self {
43 Self {
44 engine,
45 config,
46 callbacks: super::callback_mixin::BrowseCallbacks::new(),
47 tab_id_slot: Mutex::new(Arc::new(parking_lot::Mutex::new(None))),
48 }
49 }
50}
51
52#[async_trait]
53impl AgentTool for BrowseExtractTool {
54 fn name(&self) -> &str {
55 "browse_extract"
56 }
57
58 fn label(&self) -> &str {
59 "Extract Page Data"
60 }
61
62 fn description(&self) -> &str {
63 "Extract structured data from a web page: links, text content, or elements matching \
64 a CSS selector. Use when you need specific data from a page rather than the full content. \
65 Supports extracting all matching elements or just the first match."
66 }
67
68 fn on_progress(&self, callback: crate::tools::ProgressCallback) {
69 self.callbacks.store_progress(callback);
70 }
71
72 fn on_browse_progress(&self, callback: Arc<dyn Fn(super::BrowseProgress) + Send + Sync>) {
73 self.callbacks.store_browse(callback);
74 }
75
76 fn set_tab_id_slot(&self, slot: Arc<parking_lot::Mutex<Option<uuid::Uuid>>>) {
77 *self.tab_id_slot.lock() = slot;
78 }
79
80 fn current_tab_id(&self) -> Option<uuid::Uuid> {
81 *self.tab_id_slot.lock().lock()
82 }
83
84 fn parameters_schema(&self) -> Value {
85 json!({
86 "type": "object",
87 "properties": {
88 "url": {
89 "type": "string",
90 "description": "URL of the page to extract from"
91 },
92 "selector": {
93 "type": "string",
94 "description": "CSS selector to match elements"
95 },
96 "extract": {
97 "type": "string",
98 "enum": ["links", "text", "elements", "markdown"],
99 "default": "text",
100 "description": "What to extract: 'links' (href + text), 'text' (textContent), 'elements' (tag + text + attrs), 'markdown' (innerHTML as markdown)"
101 },
102 "all": {
103 "type": "boolean",
104 "default": true,
105 "description": "Return all matches (true) or just the first (false)"
106 },
107 "timeout": {
108 "type": "integer",
109 "default": 30,
110 "description": "Maximum time in seconds"
111 }
112 },
113 "required": ["url", "selector"]
114 })
115 }
116
117 async fn execute(
118 &self,
119 _tool_call_id: &str,
120 params: Value,
121 _signal: Option<oneshot::Receiver<()>>,
122 _ctx: &ToolContext,
123 ) -> Result<AgentToolResult, ToolError> {
124 let url = params["url"]
125 .as_str()
126 .ok_or_else(|| "Missing required parameter: url".to_string())?;
127
128 let selector = params["selector"]
129 .as_str()
130 .ok_or_else(|| "Missing required parameter: selector".to_string())?;
131
132 let extract = params["extract"].as_str().unwrap_or("text");
133 let all = params["all"].as_bool().unwrap_or(true);
134 let timeout_secs = params["timeout"]
135 .as_u64()
136 .unwrap_or(self.config.page_timeout_secs);
137
138 tracing::info!(url = %url, selector = %selector, extract = %extract, "extracting page data");
139
140 let output = tokio::time::timeout(
142 std::time::Duration::from_secs(timeout_secs),
143 self.extract_from_new_tab(url, selector, extract, all),
144 )
145 .await
146 .map_err(|_| format!("Extract timed out after {}s", timeout_secs))??;
147
148 Ok(output)
149 }
150}
151
152impl BrowseExtractTool {
153 async fn extract_from_new_tab(
155 &self,
156 url: &str,
157 selector: &str,
158 extract: &str,
159 all: bool,
160 ) -> Result<AgentToolResult, ToolError> {
161 let raw_tab = self
162 .engine
163 .new_tab()
164 .await
165 .map_err(|e| format!("Failed to open browser tab: {}", e))?;
166
167 let tab_id = raw_tab.tab_id();
170 *self.tab_id_slot.lock().lock() = Some(tab_id);
171
172 self.callbacks
174 .register_on_registry(tab_id, self.engine.callback_registry().as_ref());
175
176 let guard = TabGuard::new(raw_tab);
177
178 let page = guard
179 .tab()
180 .goto(url)
181 .await
182 .map_err(|e| format!("Navigation failed: {}", e))?;
183
184 let output = extract_from_tab(guard.tab(), selector, extract, all)
185 .await
186 .map_err(|e: BrowserError| e.to_string())?;
187
188 let metadata_url = page.url.clone();
189 let metadata_title = page.title.clone();
190 let result_count = count_extracted_items(&output, extract);
191
192 guard.close().await;
193 *self.tab_id_slot.lock().lock() = None;
194
195 Ok(AgentToolResult::success(output).with_metadata(json!({
196 "url": metadata_url,
197 "title": metadata_title,
198 "selector": selector,
199 "extract": extract,
200 "result_count": result_count,
201 })))
202 }
203}
204
205fn count_extracted_items(output: &str, extract: &str) -> usize {
209 match extract {
210 "links" | "elements" => {
211 serde_json::from_str::<Vec<serde_json::Value>>(output)
213 .map(|v| v.len())
214 .unwrap_or(0)
215 }
216 _ => {
217 output.lines().filter(|l| !l.trim().is_empty()).count()
219 }
220 }
221}
222
223async fn extract_from_tab(
224 tab: &dyn BrowserTab,
225 selector: &str,
226 extract: &str,
227 all: bool,
228) -> Result<String, BrowserError> {
229 match extract {
230 "links" => {
231 let js = helpers::js_links_within(selector);
232 let value = tab.evaluate(&js).await?;
233 let links = helpers::parse_link_values(value);
234 let links = if all {
235 links
236 } else {
237 links.into_iter().take(1).collect()
238 };
239 let json_links: Vec<Value> = links
240 .iter()
241 .map(|(t, h)| json!({ "text": t, "href": h }))
242 .collect();
243 Ok(serde_json::to_string_pretty(&json_links).unwrap_or_default())
244 }
245 "elements" => {
246 let js = helpers::js_query_elements(selector);
247 let value = tab.evaluate(&js).await?;
248 let elements = helpers::parse_element_values(value);
249 let elements = if all {
250 elements
251 } else {
252 elements.into_iter().take(1).collect()
253 };
254 let json_elems: Vec<Value> = elements
255 .iter()
256 .map(|(tag, text, attrs)| json!({ "tag": tag, "text": text, "attributes": attrs }))
257 .collect();
258 Ok(serde_json::to_string_pretty(&json_elems).unwrap_or_default())
259 }
260 "markdown" => {
261 let texts = tab.query_all(selector).await?;
262 let texts = if all {
263 texts
264 } else {
265 texts.into_iter().take(1).collect()
266 };
267 Ok(texts.join("\n\n"))
268 }
269 _ => {
270 let texts = tab.query_all(selector).await?;
272 let texts = if all {
273 texts
274 } else {
275 texts.into_iter().take(1).collect()
276 };
277 Ok(texts.join("\n"))
278 }
279 }
280}