1use super::config::BrowseConfig;
7use super::engine::{BrowserEngine, BrowserError, BrowserTab};
8use super::helpers;
9use super::tab_guard::TabGuard;
10use crate::tools::{AgentTool, AgentToolResult, ToolContext, ToolError};
11use async_trait::async_trait;
12use serde_json::{json, Value};
13use parking_lot::Mutex;
14use std::sync::Arc;
15use tokio::sync::oneshot;
16
17pub struct BrowseExtractTool {
22 engine: Arc<dyn BrowserEngine>,
23 config: BrowseConfig,
24 callbacks: super::callback_mixin::BrowseCallbacks,
26 tab_id_slot: Mutex<Arc<parking_lot::Mutex<Option<uuid::Uuid>>>>,
28}
29
30impl BrowseExtractTool {
31 pub fn new(engine: Arc<dyn BrowserEngine>) -> Self {
33 Self {
34 engine,
35 config: BrowseConfig::default(),
36 callbacks: super::callback_mixin::BrowseCallbacks::new(),
37 tab_id_slot: Mutex::new(Arc::new(parking_lot::Mutex::new(None))),
38 }
39 }
40
41 pub fn with_config(engine: Arc<dyn BrowserEngine>, config: BrowseConfig) -> Self {
43 Self {
44 engine,
45 config,
46 callbacks: super::callback_mixin::BrowseCallbacks::new(),
47 tab_id_slot: Mutex::new(Arc::new(parking_lot::Mutex::new(None))),
48 }
49 }
50}
51
52#[async_trait]
53impl AgentTool for BrowseExtractTool {
54 fn name(&self) -> &str {
55 "browse_extract"
56 }
57
58 fn label(&self) -> &str {
59 "Extract Page Data"
60 }
61
62 fn description(&self) -> &str {
63 "Extract structured data from a web page: links, text content, or elements matching \
64 a CSS selector. Use when you need specific data from a page rather than the full content. \
65 Supports extracting all matching elements or just the first match."
66 }
67
68 fn on_progress(&self, callback: crate::tools::ProgressCallback) {
69 self.callbacks.store_progress(callback);
70 }
71
72 fn on_browse_progress(
73 &self,
74 callback: Arc<dyn Fn(super::BrowseProgress) + Send + Sync>,
75 ) {
76 self.callbacks.store_browse(callback);
77 }
78
79 fn set_tab_id_slot(&self, slot: Arc<parking_lot::Mutex<Option<uuid::Uuid>>>) {
80 *self.tab_id_slot.lock() = slot;
81 }
82
83 fn current_tab_id(&self) -> Option<uuid::Uuid> {
84 *self.tab_id_slot.lock().lock()
85 }
86
87 fn parameters_schema(&self) -> Value {
88 json!({
89 "type": "object",
90 "properties": {
91 "url": {
92 "type": "string",
93 "description": "URL of the page to extract from"
94 },
95 "selector": {
96 "type": "string",
97 "description": "CSS selector to match elements"
98 },
99 "extract": {
100 "type": "string",
101 "enum": ["links", "text", "elements", "markdown"],
102 "default": "text",
103 "description": "What to extract: 'links' (href + text), 'text' (textContent), 'elements' (tag + text + attrs), 'markdown' (innerHTML as markdown)"
104 },
105 "all": {
106 "type": "boolean",
107 "default": true,
108 "description": "Return all matches (true) or just the first (false)"
109 },
110 "timeout": {
111 "type": "integer",
112 "default": 30,
113 "description": "Maximum time in seconds"
114 }
115 },
116 "required": ["url", "selector"]
117 })
118 }
119
120 async fn execute(
121 &self,
122 _tool_call_id: &str,
123 params: Value,
124 _signal: Option<oneshot::Receiver<()>>,
125 _ctx: &ToolContext,
126 ) -> Result<AgentToolResult, ToolError> {
127 let url = params["url"]
128 .as_str()
129 .ok_or_else(|| "Missing required parameter: url".to_string())?;
130
131 let selector = params["selector"]
132 .as_str()
133 .ok_or_else(|| "Missing required parameter: selector".to_string())?;
134
135 let extract = params["extract"].as_str().unwrap_or("text");
136 let all = params["all"].as_bool().unwrap_or(true);
137 let timeout_secs = params["timeout"]
138 .as_u64()
139 .unwrap_or(self.config.page_timeout_secs);
140
141 tracing::info!(url = %url, selector = %selector, extract = %extract, "extracting page data");
142
143 let output = tokio::time::timeout(
145 std::time::Duration::from_secs(timeout_secs),
146 self.extract_from_new_tab(url, selector, extract, all),
147 )
148 .await
149 .map_err(|_| format!("Extract timed out after {}s", timeout_secs))??;
150
151 Ok(output)
152 }
153}
154
155impl BrowseExtractTool {
156 async fn extract_from_new_tab(
158 &self,
159 url: &str,
160 selector: &str,
161 extract: &str,
162 all: bool,
163 ) -> Result<AgentToolResult, ToolError> {
164 let raw_tab = self
165 .engine
166 .new_tab()
167 .await
168 .map_err(|e| format!("Failed to open browser tab: {}", e))?;
169
170 let tab_id = raw_tab.tab_id();
173 *self.tab_id_slot.lock().lock() = Some(tab_id);
174
175 self.callbacks.register_on_registry(
177 tab_id,
178 self.engine.callback_registry().as_ref(),
179 );
180
181 let guard = TabGuard::new(raw_tab);
182
183 let page = guard
184 .tab()
185 .goto(url)
186 .await
187 .map_err(|e| format!("Navigation failed: {}", e))?;
188
189 let output = extract_from_tab(guard.tab(), selector, extract, all)
190 .await
191 .map_err(|e: BrowserError| e.to_string())?;
192
193 let metadata_url = page.url.clone();
194 let metadata_title = page.title.clone();
195 let result_count = count_extracted_items(&output, extract);
196
197 guard.close().await;
198 *self.tab_id_slot.lock().lock() = None;
199
200 Ok(AgentToolResult::success(output).with_metadata(json!({
201 "url": metadata_url,
202 "title": metadata_title,
203 "selector": selector,
204 "extract": extract,
205 "result_count": result_count,
206 })))
207 }
208}
209
210fn count_extracted_items(output: &str, extract: &str) -> usize {
214 match extract {
215 "links" | "elements" => {
216 serde_json::from_str::<Vec<serde_json::Value>>(output)
218 .map(|v| v.len())
219 .unwrap_or(0)
220 }
221 _ => {
222 output.lines().filter(|l| !l.trim().is_empty()).count()
224 }
225 }
226}
227
228async fn extract_from_tab(
229 tab: &dyn BrowserTab,
230 selector: &str,
231 extract: &str,
232 all: bool,
233) -> Result<String, BrowserError> {
234 match extract {
235 "links" => {
236 let js = helpers::js_links_within(selector);
237 let value = tab.evaluate(&js).await?;
238 let links = helpers::parse_link_values(value);
239 let links = if all {
240 links
241 } else {
242 links.into_iter().take(1).collect()
243 };
244 let json_links: Vec<Value> = links
245 .iter()
246 .map(|(t, h)| json!({ "text": t, "href": h }))
247 .collect();
248 Ok(serde_json::to_string_pretty(&json_links).unwrap_or_default())
249 }
250 "elements" => {
251 let js = helpers::js_query_elements(selector);
252 let value = tab.evaluate(&js).await?;
253 let elements = helpers::parse_element_values(value);
254 let elements = if all {
255 elements
256 } else {
257 elements.into_iter().take(1).collect()
258 };
259 let json_elems: Vec<Value> = elements
260 .iter()
261 .map(|(tag, text, attrs)| json!({ "tag": tag, "text": text, "attributes": attrs }))
262 .collect();
263 Ok(serde_json::to_string_pretty(&json_elems).unwrap_or_default())
264 }
265 "markdown" => {
266 let texts = tab.query_all(selector).await?;
267 let texts = if all {
268 texts
269 } else {
270 texts.into_iter().take(1).collect()
271 };
272 Ok(texts.join("\n\n"))
273 }
274 _ => {
275 let texts = tab.query_all(selector).await?;
277 let texts = if all {
278 texts
279 } else {
280 texts.into_iter().take(1).collect()
281 };
282 Ok(texts.join("\n"))
283 }
284 }
285}