1use std::future::Future;
7use std::pin::Pin;
8
9use super::context::MediaToolContext;
10use super::error::invalid_args;
11use super::error::MediaToolError;
12use super::{MediaOp, MediaOpResult};
13
/// Upper bound, in bytes (10 MiB), on the HTML input accepted by `resolve_html`, whether it
/// comes from the `hash` argument or from the inline `html` argument. Larger inputs are
/// rejected with an invalid-arguments error.
const MAX_HTML_SIZE: usize = 10 * 1024 * 1024;

/// Default and maximum number of matches returned by `css_select`. A caller-supplied
/// `limit` above this value is clamped down to it.
const MAX_MATCHES: usize = 1000;
19
/// The `css_select` media operation: evaluates a CSS selector against an HTML document and
/// returns the matched elements either as text content or as HTML fragments.
pub struct CssSelectOp;
21
22impl MediaOp for CssSelectOp {
23 fn name(&self) -> &'static str {
24 "css_select"
25 }
26
27 fn description(&self) -> &'static str {
28 "Extract elements from HTML using CSS selectors (returns text or HTML fragments)"
29 }
30
31 fn parameters_schema(&self) -> serde_json::Value {
32 serde_json::json!({
33 "type": "object",
34 "properties": {
35 "hash": {
36 "type": "string",
37 "description": "CAS hash of HTML content (blake3:...)"
38 },
39 "html": {
40 "type": "string",
41 "description": "Raw HTML string to query"
42 },
43 "selector": {
44 "type": "string",
45 "description": "CSS selector (e.g., 'div.product h2', '#main a')"
46 },
47 "output": {
48 "type": "string",
49 "enum": ["text", "html"],
50 "description": "Output mode: 'text' (default) extracts text content, 'html' returns HTML fragments",
51 "default": "text"
52 },
53 "limit": {
54 "type": "integer",
55 "description": "Maximum number of matches to return (default: 1000)",
56 "default": 1000
57 }
58 },
59 "required": ["selector"],
60 "additionalProperties": false
61 })
62 }
63
64 fn execute<'a>(
65 &'a self,
66 args: serde_json::Value,
67 ctx: &'a MediaToolContext,
68 ) -> Pin<Box<dyn Future<Output = Result<MediaOpResult, MediaToolError>> + Send + 'a>> {
69 Box::pin(async move {
70 ctx.check_cancelled()?;
71
72 let selector_str = args
73 .get("selector")
74 .and_then(|v| v.as_str())
75 .ok_or_else(|| invalid_args("css_select", "missing 'selector' parameter"))?
76 .to_string();
77
78 let output_mode = args
79 .get("output")
80 .and_then(|v| v.as_str())
81 .unwrap_or("text")
82 .to_string();
83
84 if output_mode != "text" && output_mode != "html" {
85 return Err(invalid_args(
86 "css_select",
87 format!("invalid output mode '{output_mode}', expected 'text' or 'html'"),
88 ));
89 }
90
91 let limit = args
92 .get("limit")
93 .and_then(|v| v.as_u64())
94 .unwrap_or(MAX_MATCHES as u64)
95 .min(MAX_MATCHES as u64) as usize;
96
97 let html = resolve_html(&args, ctx).await?;
98
99 let matches = ctx
101 .compute
102 .compute(move || -> Result<Vec<String>, MediaToolError> {
103 let document = scraper::Html::parse_document(&html);
104
105 let selector = scraper::Selector::parse(&selector_str).map_err(|e| {
106 invalid_args(
107 "css_select",
108 format!("invalid CSS selector '{selector_str}': {e}"),
109 )
110 })?;
111
112 let results: Vec<String> = document
113 .select(&selector)
114 .take(limit)
115 .map(|el| {
116 if output_mode == "html" {
117 el.html()
118 } else {
119 el.text().collect::<Vec<_>>().join("")
120 }
121 })
122 .collect();
123
124 Ok(results)
125 })
126 .await??;
127
128 let count = matches.len();
129
130 Ok(MediaOpResult::Metadata(serde_json::json!({
131 "matches": matches,
132 "count": count
133 })))
134 })
135 }
136}
137
138async fn resolve_html(
140 args: &serde_json::Value,
141 ctx: &MediaToolContext,
142) -> Result<String, MediaToolError> {
143 if let Some(hash) = args.get("hash").and_then(|v| v.as_str()) {
144 let data = ctx.read_media(hash).await?;
145 if data.len() > MAX_HTML_SIZE {
146 return Err(invalid_args(
147 "css_select",
148 format!(
149 "HTML content too large ({} bytes, max {} bytes)",
150 data.len(),
151 MAX_HTML_SIZE
152 ),
153 ));
154 }
155 String::from_utf8(data).map_err(|_| {
156 invalid_args(
157 "css_select",
158 "CAS content is not valid UTF-8 (expected HTML)",
159 )
160 })
161 } else if let Some(html) = args.get("html").and_then(|v| v.as_str()) {
162 if html.len() > MAX_HTML_SIZE {
163 return Err(invalid_args(
164 "css_select",
165 format!(
166 "HTML string too large ({} bytes, max {} bytes)",
167 html.len(),
168 MAX_HTML_SIZE
169 ),
170 ));
171 }
172 Ok(html.to_string())
173 } else {
174 Err(invalid_args(
175 "css_select",
176 "missing 'hash' or 'html' parameter",
177 ))
178 }
179}
180
#[cfg(test)]
mod tests {
    use super::*;
    use crate::CasStore;
    use std::sync::Arc;

    /// Builds a `MediaToolContext` whose `CasStore` lives in a fresh temporary directory.
    ///
    /// The `TempDir` guard is returned so the directory stays alive for the whole test.
    async fn setup() -> (tempfile::TempDir, Arc<MediaToolContext>) {
        let dir = tempfile::tempdir().unwrap();
        let ctx = Arc::new(MediaToolContext::new(CasStore::new(dir.path())).unwrap());
        (dir, ctx)
    }

    /// Executes `css_select` with `args` and returns the raw result.
    async fn run(
        ctx: &MediaToolContext,
        args: serde_json::Value,
    ) -> Result<MediaOpResult, MediaToolError> {
        let op = CssSelectOp;
        op.execute(args, ctx).await
    }

    /// Executes `css_select`, expecting success, and returns the `Metadata` JSON payload.
    async fn run_ok(ctx: &MediaToolContext, args: serde_json::Value) -> serde_json::Value {
        let result = run(ctx, args).await.unwrap();
        if let MediaOpResult::Metadata(v) = result {
            v
        } else {
            panic!("expected Metadata result");
        }
    }

    /// Executes `css_select`, expecting failure, and returns the rendered error message.
    async fn run_err(ctx: &MediaToolContext, args: serde_json::Value) -> String {
        let result = run(ctx, args).await;
        assert!(result.is_err());
        result.unwrap_err().to_string()
    }

    const SAMPLE_HTML: &str = r#"
        <html>
        <body>
            <h1 id="title">Main Title</h1>
            <div class="product">
                <h2>Product A</h2>
                <p class="price">$10</p>
            </div>
            <div class="product">
                <h2>Product B</h2>
                <p class="price">$20</p>
            </div>
            <ul>
                <li>Item 1</li>
                <li>Item 2</li>
            </ul>
        </body>
        </html>
    "#;

    #[tokio::test]
    async fn select_by_tag() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(
            &ctx,
            serde_json::json!({"html": SAMPLE_HTML, "selector": "h2"}),
        )
        .await;

        let matches = v["matches"].as_array().unwrap();
        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0], "Product A");
        assert_eq!(matches[1], "Product B");
        assert_eq!(v["count"], 2);
    }

    #[tokio::test]
    async fn select_by_class() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(
            &ctx,
            serde_json::json!({"html": SAMPLE_HTML, "selector": ".price"}),
        )
        .await;

        let matches = v["matches"].as_array().unwrap();
        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0], "$10");
        assert_eq!(matches[1], "$20");
    }

    #[tokio::test]
    async fn select_by_id() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(
            &ctx,
            serde_json::json!({"html": SAMPLE_HTML, "selector": "#title"}),
        )
        .await;

        let matches = v["matches"].as_array().unwrap();
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0], "Main Title");
    }

    #[tokio::test]
    async fn select_nested() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(
            &ctx,
            serde_json::json!({"html": SAMPLE_HTML, "selector": "div.product h2"}),
        )
        .await;

        let matches = v["matches"].as_array().unwrap();
        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0], "Product A");
    }

    #[tokio::test]
    async fn select_text_mode() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(
            &ctx,
            serde_json::json!({
                "html": SAMPLE_HTML,
                "selector": ".product",
                "output": "text"
            }),
        )
        .await;

        let matches = v["matches"].as_array().unwrap();
        assert_eq!(matches.len(), 2);
        let text = matches[0].as_str().unwrap();
        assert!(
            text.contains("Product A"),
            "text should contain title: {text}"
        );
        assert!(text.contains("$10"), "text should contain price: {text}");
    }

    #[tokio::test]
    async fn select_html_mode() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(
            &ctx,
            serde_json::json!({
                "html": SAMPLE_HTML,
                "selector": "li",
                "output": "html"
            }),
        )
        .await;

        let matches = v["matches"].as_array().unwrap();
        assert_eq!(matches.len(), 2);
        let html = matches[0].as_str().unwrap();
        assert!(
            html.contains("<li>"),
            "html mode should include tags: {html}"
        );
        assert!(html.contains("Item 1"), "html should contain text: {html}");
    }

    #[tokio::test]
    async fn select_respects_limit() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(
            &ctx,
            serde_json::json!({"html": SAMPLE_HTML, "selector": "li", "limit": 1}),
        )
        .await;

        // Two <li> elements exist; `limit` must cut the result down to the first one.
        let matches = v["matches"].as_array().unwrap();
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0], "Item 1");
        assert_eq!(v["count"], 1);
    }

    #[tokio::test]
    async fn select_invalid_selector() {
        let (_dir, ctx) = setup().await;
        let message = run_err(
            &ctx,
            serde_json::json!({"html": SAMPLE_HTML, "selector": "!!!invalid"}),
        )
        .await;
        assert!(message.contains("NIKA-294"));
    }

    #[tokio::test]
    async fn select_invalid_output_mode() {
        let (_dir, ctx) = setup().await;
        let message = run_err(
            &ctx,
            serde_json::json!({"html": SAMPLE_HTML, "selector": "li", "output": "xml"}),
        )
        .await;
        assert!(message.contains("NIKA-294"));
    }

    #[tokio::test]
    async fn select_no_matches() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(
            &ctx,
            serde_json::json!({"html": SAMPLE_HTML, "selector": "span.nonexistent"}),
        )
        .await;

        let matches = v["matches"].as_array().unwrap();
        assert!(matches.is_empty());
        assert_eq!(v["count"], 0);
    }

    #[tokio::test]
    async fn select_missing_selector_param() {
        let (_dir, ctx) = setup().await;
        let message = run_err(&ctx, serde_json::json!({"html": "<p>test</p>"})).await;
        assert!(message.contains("NIKA-294"));
    }

    #[tokio::test]
    async fn select_missing_html_source() {
        let (_dir, ctx) = setup().await;
        // Neither `hash` nor `html` is supplied, so there is nothing to query.
        let message = run_err(&ctx, serde_json::json!({"selector": "p"})).await;
        assert!(message.contains("NIKA-294"));
    }

    #[tokio::test]
    async fn select_from_cas_hash() {
        let (_dir, ctx) = setup().await;
        let sr = ctx.cas.store(SAMPLE_HTML.as_bytes()).await.unwrap();

        let v = run_ok(&ctx, serde_json::json!({"hash": sr.hash, "selector": "h1"})).await;

        let matches = v["matches"].as_array().unwrap();
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0], "Main Title");
    }

    #[tokio::test]
    async fn select_cancelled() {
        let (_dir, ctx) = setup().await;
        // Cancel before executing: the operation must bail out at its first check.
        ctx.cancel.cancel();
        let message = run_err(
            &ctx,
            serde_json::json!({"html": "<p>x</p>", "selector": "p"}),
        )
        .await;
        assert!(message.contains("cancelled"));
    }
}