Skip to main content

nika_media/tools/
readability.rs

1//! nika:readability — Article content extraction (Mozilla Readability).
2//!
3//! Extracts the main article content from a web page, stripping
4//! navigation, footer, ads, and other non-content elements.
5//! Uses `dom_smoothie` (Rust port of Mozilla's Readability.js).
6
7use std::future::Future;
8use std::pin::Pin;
9
10use super::context::MediaToolContext;
11use super::error::MediaToolError;
12use super::error::{invalid_args, tool_error};
13use super::{MediaOp, MediaOpResult};
14
/// Upper bound, in bytes, on the HTML accepted for extraction (10 MiB).
///
/// Applied to both CAS-backed content and inline `html` strings.
const MAX_HTML_SIZE: usize = 10 << 20;
17
/// Media operation that extracts the main article content from an HTML page.
///
/// The type is a stateless unit struct; all inputs arrive through the
/// arguments passed to `execute`.
#[derive(Debug, Clone, Copy, Default)]
pub struct ReadabilityOp;
19
20impl MediaOp for ReadabilityOp {
21    fn name(&self) -> &'static str {
22        "readability"
23    }
24
25    fn description(&self) -> &'static str {
26        "Extract main article content from HTML, stripping nav/footer/ads (Mozilla Readability)"
27    }
28
29    fn parameters_schema(&self) -> serde_json::Value {
30        serde_json::json!({
31          "type": "object",
32          "properties": {
33            "hash": {
34              "type": "string",
35              "description": "CAS hash of HTML content (blake3:...)"
36            },
37            "html": {
38              "type": "string",
39              "description": "Raw HTML string"
40            },
41            "url": {
42              "type": "string",
43              "description": "URL of the page (for resolving relative links)"
44            }
45          },
46          "required": ["hash"],
47          "additionalProperties": false
48        })
49    }
50
51    fn execute<'a>(
52        &'a self,
53        args: serde_json::Value,
54        ctx: &'a MediaToolContext,
55    ) -> Pin<Box<dyn Future<Output = Result<MediaOpResult, MediaToolError>> + Send + 'a>> {
56        Box::pin(async move {
57            ctx.check_cancelled()?;
58
59            let html = resolve_html(&args, ctx).await?;
60            let url = args
61                .get("url")
62                .and_then(|v| v.as_str())
63                .map(|s| s.to_string());
64
65            if html.is_empty() {
66                return Ok(MediaOpResult::Metadata(serde_json::json!({
67                  "title": null,
68                  "content": "",
69                  "text_content": "",
70                  "excerpt": null,
71                  "char_count": 0
72                })));
73            }
74
75            // dom_smoothie is CPU-intensive — run on compute pool
76            let result = ctx
77                .compute
78                .compute(move || -> Result<serde_json::Value, MediaToolError> {
79                    let mut readability =
80                        dom_smoothie::Readability::new(html.as_str(), url.as_deref(), None)
81                            .map_err(|e| {
82                                tool_error("readability", format!("failed to initialize: {e}"))
83                            })?;
84
85                    let article = readability.parse().map_err(|e| {
86                        tool_error("readability", format!("extraction failed: {e}"))
87                    })?;
88
89                    let content_str = article.content.to_string();
90                    let text_content_str = article.text_content.to_string();
91                    let char_count = text_content_str.len();
92
93                    Ok(serde_json::json!({
94                        "title": article.title,
95                        "byline": article.byline,
96                        "content": content_str,
97                        "text_content": text_content_str,
98                        "excerpt": article.excerpt,
99                        "site_name": article.site_name,
100                        "lang": article.lang,
101                        "published_time": article.published_time,
102                        "char_count": char_count,
103                    }))
104                })
105                .await??;
106
107            Ok(MediaOpResult::Metadata(result))
108        })
109    }
110}
111
112/// Resolve HTML content from either a CAS hash or raw HTML string.
113async fn resolve_html(
114    args: &serde_json::Value,
115    ctx: &MediaToolContext,
116) -> Result<String, MediaToolError> {
117    if let Some(hash) = args.get("hash").and_then(|v| v.as_str()) {
118        let data = ctx.read_media(hash).await?;
119        if data.len() > MAX_HTML_SIZE {
120            return Err(invalid_args(
121                "readability",
122                format!(
123                    "HTML content too large ({} bytes, max {} bytes)",
124                    data.len(),
125                    MAX_HTML_SIZE
126                ),
127            ));
128        }
129        String::from_utf8(data).map_err(|_| {
130            invalid_args(
131                "readability",
132                "CAS content is not valid UTF-8 (expected HTML)",
133            )
134        })
135    } else if let Some(html) = args.get("html").and_then(|v| v.as_str()) {
136        if html.len() > MAX_HTML_SIZE {
137            return Err(invalid_args(
138                "readability",
139                format!(
140                    "HTML string too large ({} bytes, max {} bytes)",
141                    html.len(),
142                    MAX_HTML_SIZE
143                ),
144            ));
145        }
146        Ok(html.to_string())
147    } else {
148        Err(invalid_args(
149            "readability",
150            "missing 'hash' or 'html' parameter",
151        ))
152    }
153}
154
#[cfg(test)]
mod tests {
    use super::*;
    use crate::CasStore;
    use std::sync::Arc;

    /// Build a tool context backed by a CAS store in a fresh temp directory.
    ///
    /// The `TempDir` is returned so the caller keeps it in scope for the
    /// duration of the test.
    async fn setup() -> (tempfile::TempDir, Arc<MediaToolContext>) {
        let dir = tempfile::tempdir().unwrap();
        let store = CasStore::new(dir.path());
        let ctx = Arc::new(MediaToolContext::new(store).unwrap());
        (dir, ctx)
    }

    /// Execute the op with `args`, require success, and return the metadata payload.
    async fn run_ok(ctx: &MediaToolContext, args: serde_json::Value) -> serde_json::Value {
        match ReadabilityOp.execute(args, ctx).await.unwrap() {
            MediaOpResult::Metadata(v) => v,
            _ => panic!("expected Metadata result"),
        }
    }

    /// Sample page with navigation, a multi-paragraph article, and a footer.
    const ARTICLE_HTML: &str = r#"
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <title>The Future of Rust - A Deep Dive</title>
            <meta name="author" content="Alice Smith">
            <meta name="description" content="An in-depth look at Rust's future">
        </head>
        <body>
            <nav>
                <a href="/">Home</a>
                <a href="/blog">Blog</a>
            </nav>
            <article>
                <h1>The Future of Rust</h1>
                <p>Rust has become one of the most loved programming languages in the world.
                   Its focus on safety, performance, and concurrency makes it ideal for systems
                   programming, web development, and more. In this article, we explore what
                   the future holds for the Rust ecosystem.</p>
                <p>The Rust community has been growing steadily. With the introduction of
                   async/await, the language has become more accessible for network programming.
                   The borrow checker, once seen as a barrier, is now appreciated as a powerful
                   tool for preventing bugs at compile time.</p>
                <p>Looking ahead, improvements to compile times, better IDE support, and
                   expanding the standard library are key priorities. The Rust Foundation
                   continues to invest in the language's infrastructure and community.</p>
                <p>Many companies including Mozilla, Microsoft, Google, and Amazon are now
                   using Rust in production. The language's adoption in safety-critical systems,
                   embedded development, and WebAssembly is accelerating.</p>
                <p>In conclusion, Rust's future looks bright. The combination of performance,
                   safety, and a thriving community ensures that Rust will continue to grow
                   and evolve for years to come.</p>
            </article>
            <footer>
                <p>Copyright 2026 Example Corp</p>
                <a href="/privacy">Privacy Policy</a>
            </footer>
        </body>
        </html>
    "#;

    #[tokio::test]
    async fn extract_article_content() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(&ctx, serde_json::json!({"html": ARTICLE_HTML})).await;

        let text = v["text_content"].as_str().unwrap();
        assert!(text.contains("Rust"), "should extract article text: {text}");

        let count = v["char_count"].as_u64().unwrap();
        assert!(count > 100, "should have substantial content");
    }

    #[tokio::test]
    async fn extract_title() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(&ctx, serde_json::json!({"html": ARTICLE_HTML})).await;

        let title = v["title"].as_str().unwrap();
        assert!(
            title.contains("Rust"),
            "should extract article title: {title}"
        );
    }

    #[tokio::test]
    async fn strips_navigation() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(&ctx, serde_json::json!({"html": ARTICLE_HTML})).await;

        let content = v["content"].as_str().unwrap();
        // Footer link text must not leak into the extracted content.
        let has_footer = content.contains("Privacy Policy");
        assert!(!has_footer, "should strip footer: {content}");
    }

    #[tokio::test]
    async fn extract_from_cas_hash() {
        let (_dir, ctx) = setup().await;
        let sr = ctx.cas.store(ARTICLE_HTML.as_bytes()).await.unwrap();

        let v = run_ok(&ctx, serde_json::json!({"hash": sr.hash})).await;
        assert!(v["char_count"].as_u64().unwrap() > 0);
    }

    #[tokio::test]
    async fn extract_with_url() {
        let (_dir, ctx) = setup().await;
        let args = serde_json::json!({
            "html": ARTICLE_HTML,
            "url": "https://example.com/article"
        });
        let v = run_ok(&ctx, args).await;

        assert!(
            v["char_count"].as_u64().unwrap() > 0,
            "should extract content with URL context"
        );
    }

    #[tokio::test]
    async fn extract_empty_html() {
        let (_dir, ctx) = setup().await;
        let v = run_ok(&ctx, serde_json::json!({"html": ""})).await;

        assert_eq!(v["char_count"], 0);
        assert_eq!(v["content"], "");
    }

    #[tokio::test]
    async fn extract_missing_params() {
        let (_dir, ctx) = setup().await;
        let outcome = ReadabilityOp.execute(serde_json::json!({}), &ctx).await;

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("NIKA-294"));
    }

    #[tokio::test]
    async fn extract_cancelled() {
        let (_dir, ctx) = setup().await;
        // Cancel before executing so the op's up-front cancellation check fires.
        ctx.cancel.cancel();

        let outcome = ReadabilityOp
            .execute(serde_json::json!({"html": ARTICLE_HTML}), &ctx)
            .await;

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("cancelled"));
    }
}
351}