Skip to main content

nika_media/tools/
css_select.rs

//! nika:css_select — CSS selector extraction from HTML content.
//!
//! Accepts either a CAS hash or a raw HTML string, plus a CSS selector.
//! Returns the matching elements as text or as HTML fragments.
5
6use std::future::Future;
7use std::pin::Pin;
8
9use super::context::MediaToolContext;
10use super::error::invalid_args;
11use super::error::MediaToolError;
12use super::{MediaOp, MediaOpResult};
13
/// Upper bound on accepted HTML input, in bytes (10 MiB).
const MAX_HTML_SIZE: usize = 10 << 20;

/// Hard cap on how many matches are returned, so output stays bounded.
const MAX_MATCHES: usize = 1_000;
19
/// The `css_select` media operation: extracts elements from HTML using a
/// CSS selector.
///
/// This is a stateless unit type; every input arrives through the arguments
/// passed to `execute`.
#[derive(Debug, Clone, Copy, Default)]
pub struct CssSelectOp;
21
22impl MediaOp for CssSelectOp {
23    fn name(&self) -> &'static str {
24        "css_select"
25    }
26
27    fn description(&self) -> &'static str {
28        "Extract elements from HTML using CSS selectors (returns text or HTML fragments)"
29    }
30
31    fn parameters_schema(&self) -> serde_json::Value {
32        serde_json::json!({
33          "type": "object",
34          "properties": {
35            "hash": {
36              "type": "string",
37              "description": "CAS hash of HTML content (blake3:...)"
38            },
39            "html": {
40              "type": "string",
41              "description": "Raw HTML string to query"
42            },
43            "selector": {
44              "type": "string",
45              "description": "CSS selector (e.g., 'div.product h2', '#main a')"
46            },
47            "output": {
48              "type": "string",
49              "enum": ["text", "html"],
50              "description": "Output mode: 'text' (default) extracts text content, 'html' returns HTML fragments",
51              "default": "text"
52            },
53            "limit": {
54              "type": "integer",
55              "description": "Maximum number of matches to return (default: 1000)",
56              "default": 1000
57            }
58          },
59          "required": ["selector"],
60          "additionalProperties": false
61        })
62    }
63
64    fn execute<'a>(
65        &'a self,
66        args: serde_json::Value,
67        ctx: &'a MediaToolContext,
68    ) -> Pin<Box<dyn Future<Output = Result<MediaOpResult, MediaToolError>> + Send + 'a>> {
69        Box::pin(async move {
70            ctx.check_cancelled()?;
71
72            let selector_str = args
73                .get("selector")
74                .and_then(|v| v.as_str())
75                .ok_or_else(|| invalid_args("css_select", "missing 'selector' parameter"))?
76                .to_string();
77
78            let output_mode = args
79                .get("output")
80                .and_then(|v| v.as_str())
81                .unwrap_or("text")
82                .to_string();
83
84            if output_mode != "text" && output_mode != "html" {
85                return Err(invalid_args(
86                    "css_select",
87                    format!("invalid output mode '{output_mode}', expected 'text' or 'html'"),
88                ));
89            }
90
91            let limit = args
92                .get("limit")
93                .and_then(|v| v.as_u64())
94                .unwrap_or(MAX_MATCHES as u64)
95                .min(MAX_MATCHES as u64) as usize;
96
97            let html = resolve_html(&args, ctx).await?;
98
99            // Parse and select on the compute pool (can be CPU-intensive)
100            let matches = ctx
101                .compute
102                .compute(move || -> Result<Vec<String>, MediaToolError> {
103                    let document = scraper::Html::parse_document(&html);
104
105                    let selector = scraper::Selector::parse(&selector_str).map_err(|e| {
106                        invalid_args(
107                            "css_select",
108                            format!("invalid CSS selector '{selector_str}': {e}"),
109                        )
110                    })?;
111
112                    let results: Vec<String> = document
113                        .select(&selector)
114                        .take(limit)
115                        .map(|el| {
116                            if output_mode == "html" {
117                                el.html()
118                            } else {
119                                el.text().collect::<Vec<_>>().join("")
120                            }
121                        })
122                        .collect();
123
124                    Ok(results)
125                })
126                .await??;
127
128            let count = matches.len();
129
130            Ok(MediaOpResult::Metadata(serde_json::json!({
131              "matches": matches,
132              "count": count
133            })))
134        })
135    }
136}
137
138/// Resolve HTML content from either a CAS hash or raw HTML string.
139async fn resolve_html(
140    args: &serde_json::Value,
141    ctx: &MediaToolContext,
142) -> Result<String, MediaToolError> {
143    if let Some(hash) = args.get("hash").and_then(|v| v.as_str()) {
144        let data = ctx.read_media(hash).await?;
145        if data.len() > MAX_HTML_SIZE {
146            return Err(invalid_args(
147                "css_select",
148                format!(
149                    "HTML content too large ({} bytes, max {} bytes)",
150                    data.len(),
151                    MAX_HTML_SIZE
152                ),
153            ));
154        }
155        String::from_utf8(data).map_err(|_| {
156            invalid_args(
157                "css_select",
158                "CAS content is not valid UTF-8 (expected HTML)",
159            )
160        })
161    } else if let Some(html) = args.get("html").and_then(|v| v.as_str()) {
162        if html.len() > MAX_HTML_SIZE {
163            return Err(invalid_args(
164                "css_select",
165                format!(
166                    "HTML string too large ({} bytes, max {} bytes)",
167                    html.len(),
168                    MAX_HTML_SIZE
169                ),
170            ));
171        }
172        Ok(html.to_string())
173    } else {
174        Err(invalid_args(
175            "css_select",
176            "missing 'hash' or 'html' parameter",
177        ))
178    }
179}
180
#[cfg(test)]
mod tests {
    use super::*;
    use crate::CasStore;
    use std::sync::Arc;

    /// Build a context backed by a fresh temporary CAS directory.
    ///
    /// The `TempDir` is returned so the directory outlives the test body.
    async fn setup() -> (tempfile::TempDir, Arc<MediaToolContext>) {
        let dir = tempfile::tempdir().unwrap();
        let ctx = Arc::new(MediaToolContext::new(CasStore::new(dir.path())).unwrap());
        (dir, ctx)
    }

    const SAMPLE_HTML: &str = r#"
        <html>
        <body>
            <h1 id="title">Main Title</h1>
            <div class="product">
                <h2>Product A</h2>
                <p class="price">$10</p>
            </div>
            <div class="product">
                <h2>Product B</h2>
                <p class="price">$20</p>
            </div>
            <ul>
                <li>Item 1</li>
                <li>Item 2</li>
            </ul>
        </body>
        </html>
    "#;

    /// Execute `css_select` with `args` against an existing context.
    async fn run(
        ctx: &MediaToolContext,
        args: serde_json::Value,
    ) -> Result<MediaOpResult, MediaToolError> {
        let op = CssSelectOp;
        // Bind the result first so the future borrowing `op` is dropped before `op`.
        let outcome = op.execute(args, ctx).await;
        outcome
    }

    /// Unwrap the JSON payload of a `Metadata` result, panicking otherwise.
    fn metadata(result: MediaOpResult) -> serde_json::Value {
        if let MediaOpResult::Metadata(v) = result {
            v
        } else {
            panic!("expected Metadata result");
        }
    }

    /// Run against a fresh context and return the metadata payload.
    async fn run_ok(args: serde_json::Value) -> serde_json::Value {
        let (_dir, ctx) = setup().await;
        metadata(run(&ctx, args).await.unwrap())
    }

    /// Run against a fresh context and return the error rendered as a string.
    async fn run_err(args: serde_json::Value) -> String {
        let (_dir, ctx) = setup().await;
        let result = run(&ctx, args).await;
        assert!(result.is_err());
        result.unwrap_err().to_string()
    }

    /// Collect the `matches` array of a payload as string slices.
    fn match_strings(payload: &serde_json::Value) -> Vec<&str> {
        payload["matches"]
            .as_array()
            .unwrap()
            .iter()
            .map(|m| m.as_str().unwrap())
            .collect()
    }

    #[tokio::test]
    async fn select_by_tag() {
        let v = run_ok(serde_json::json!({"html": SAMPLE_HTML, "selector": "h2"})).await;
        assert_eq!(match_strings(&v), ["Product A", "Product B"]);
        assert_eq!(v["count"], 2);
    }

    #[tokio::test]
    async fn select_by_class() {
        let v = run_ok(serde_json::json!({"html": SAMPLE_HTML, "selector": ".price"})).await;
        assert_eq!(match_strings(&v), ["$10", "$20"]);
    }

    #[tokio::test]
    async fn select_by_id() {
        let v = run_ok(serde_json::json!({"html": SAMPLE_HTML, "selector": "#title"})).await;
        assert_eq!(match_strings(&v), ["Main Title"]);
    }

    #[tokio::test]
    async fn select_nested() {
        let v =
            run_ok(serde_json::json!({"html": SAMPLE_HTML, "selector": "div.product h2"})).await;
        let matches = match_strings(&v);
        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0], "Product A");
    }

    #[tokio::test]
    async fn select_text_mode() {
        let v = run_ok(serde_json::json!({
            "html": SAMPLE_HTML,
            "selector": ".product",
            "output": "text"
        }))
        .await;

        let matches = match_strings(&v);
        assert_eq!(matches.len(), 2);
        let text = matches[0];
        assert!(
            text.contains("Product A"),
            "text should contain title: {text}"
        );
        assert!(text.contains("$10"), "text should contain price: {text}");
    }

    #[tokio::test]
    async fn select_html_mode() {
        let v = run_ok(serde_json::json!({
            "html": SAMPLE_HTML,
            "selector": "li",
            "output": "html"
        }))
        .await;

        let matches = match_strings(&v);
        assert_eq!(matches.len(), 2);
        let html = matches[0];
        assert!(
            html.contains("<li>"),
            "html mode should include tags: {html}"
        );
        assert!(html.contains("Item 1"), "html should contain text: {html}");
    }

    #[tokio::test]
    async fn select_respects_limit() {
        let v = run_ok(serde_json::json!({
            "html": SAMPLE_HTML,
            "selector": "li",
            "limit": 1
        }))
        .await;
        assert_eq!(match_strings(&v), ["Item 1"]);
        assert_eq!(v["count"], 1);
    }

    #[tokio::test]
    async fn select_limit_above_cap_is_accepted() {
        // A limit larger than MAX_MATCHES is clamped, not rejected.
        let v = run_ok(serde_json::json!({
            "html": SAMPLE_HTML,
            "selector": "li",
            "limit": (MAX_MATCHES as u64) + 1
        }))
        .await;
        assert_eq!(match_strings(&v), ["Item 1", "Item 2"]);
    }

    #[tokio::test]
    async fn select_invalid_selector() {
        let msg =
            run_err(serde_json::json!({"html": SAMPLE_HTML, "selector": "!!!invalid"})).await;
        assert!(msg.contains("NIKA-294"));
    }

    #[tokio::test]
    async fn select_invalid_output_mode() {
        let msg = run_err(serde_json::json!({
            "html": SAMPLE_HTML,
            "selector": "li",
            "output": "xml"
        }))
        .await;
        assert!(msg.contains("NIKA-294"));
    }

    #[tokio::test]
    async fn select_no_matches() {
        let v =
            run_ok(serde_json::json!({"html": SAMPLE_HTML, "selector": "span.nonexistent"})).await;
        assert!(match_strings(&v).is_empty());
        assert_eq!(v["count"], 0);
    }

    #[tokio::test]
    async fn select_missing_selector_param() {
        let msg = run_err(serde_json::json!({"html": "<p>test</p>"})).await;
        assert!(msg.contains("NIKA-294"));
    }

    #[tokio::test]
    async fn select_missing_html_and_hash() {
        let msg = run_err(serde_json::json!({"selector": "p"})).await;
        assert!(msg.contains("NIKA-294"));
    }

    #[tokio::test]
    async fn select_from_cas_hash() {
        let (_dir, ctx) = setup().await;
        let sr = ctx.cas.store(SAMPLE_HTML.as_bytes()).await.unwrap();

        let result = run(&ctx, serde_json::json!({"hash": sr.hash, "selector": "h1"}))
            .await
            .unwrap();
        let v = metadata(result);
        assert_eq!(match_strings(&v), ["Main Title"]);
    }

    #[tokio::test]
    async fn select_cancelled() {
        let (_dir, ctx) = setup().await;
        ctx.cancel.cancel();

        let result = run(
            &ctx,
            serde_json::json!({"html": "<p>x</p>", "selector": "p"}),
        )
        .await;
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("cancelled"));
    }
}