1use std::future::Future;
8use std::pin::Pin;
9
10use super::context::MediaToolContext;
11use super::error::MediaToolError;
12use super::error::{invalid_args, tool_error};
13use super::{MediaOp, MediaOpResult};
14
/// Largest HTML payload, in bytes, that the readability operation accepts (10 MiB).
///
/// Checked against both CAS-loaded content and inline `html` strings.
const MAX_HTML_SIZE: usize = 10 * (1 << 20);
17
/// Media operation that extracts the main article content from an HTML document.
///
/// The type carries no state; all behavior lives in its `MediaOp` implementation.
pub struct ReadabilityOp;
19
20impl MediaOp for ReadabilityOp {
21 fn name(&self) -> &'static str {
22 "readability"
23 }
24
25 fn description(&self) -> &'static str {
26 "Extract main article content from HTML, stripping nav/footer/ads (Mozilla Readability)"
27 }
28
29 fn parameters_schema(&self) -> serde_json::Value {
30 serde_json::json!({
31 "type": "object",
32 "properties": {
33 "hash": {
34 "type": "string",
35 "description": "CAS hash of HTML content (blake3:...)"
36 },
37 "html": {
38 "type": "string",
39 "description": "Raw HTML string"
40 },
41 "url": {
42 "type": "string",
43 "description": "URL of the page (for resolving relative links)"
44 }
45 },
46 "required": ["hash"],
47 "additionalProperties": false
48 })
49 }
50
51 fn execute<'a>(
52 &'a self,
53 args: serde_json::Value,
54 ctx: &'a MediaToolContext,
55 ) -> Pin<Box<dyn Future<Output = Result<MediaOpResult, MediaToolError>> + Send + 'a>> {
56 Box::pin(async move {
57 ctx.check_cancelled()?;
58
59 let html = resolve_html(&args, ctx).await?;
60 let url = args
61 .get("url")
62 .and_then(|v| v.as_str())
63 .map(|s| s.to_string());
64
65 if html.is_empty() {
66 return Ok(MediaOpResult::Metadata(serde_json::json!({
67 "title": null,
68 "content": "",
69 "text_content": "",
70 "excerpt": null,
71 "char_count": 0
72 })));
73 }
74
75 let result = ctx
77 .compute
78 .compute(move || -> Result<serde_json::Value, MediaToolError> {
79 let mut readability =
80 dom_smoothie::Readability::new(html.as_str(), url.as_deref(), None)
81 .map_err(|e| {
82 tool_error("readability", format!("failed to initialize: {e}"))
83 })?;
84
85 let article = readability.parse().map_err(|e| {
86 tool_error("readability", format!("extraction failed: {e}"))
87 })?;
88
89 let content_str = article.content.to_string();
90 let text_content_str = article.text_content.to_string();
91 let char_count = text_content_str.len();
92
93 Ok(serde_json::json!({
94 "title": article.title,
95 "byline": article.byline,
96 "content": content_str,
97 "text_content": text_content_str,
98 "excerpt": article.excerpt,
99 "site_name": article.site_name,
100 "lang": article.lang,
101 "published_time": article.published_time,
102 "char_count": char_count,
103 }))
104 })
105 .await??;
106
107 Ok(MediaOpResult::Metadata(result))
108 })
109 }
110}
111
112async fn resolve_html(
114 args: &serde_json::Value,
115 ctx: &MediaToolContext,
116) -> Result<String, MediaToolError> {
117 if let Some(hash) = args.get("hash").and_then(|v| v.as_str()) {
118 let data = ctx.read_media(hash).await?;
119 if data.len() > MAX_HTML_SIZE {
120 return Err(invalid_args(
121 "readability",
122 format!(
123 "HTML content too large ({} bytes, max {} bytes)",
124 data.len(),
125 MAX_HTML_SIZE
126 ),
127 ));
128 }
129 String::from_utf8(data).map_err(|_| {
130 invalid_args(
131 "readability",
132 "CAS content is not valid UTF-8 (expected HTML)",
133 )
134 })
135 } else if let Some(html) = args.get("html").and_then(|v| v.as_str()) {
136 if html.len() > MAX_HTML_SIZE {
137 return Err(invalid_args(
138 "readability",
139 format!(
140 "HTML string too large ({} bytes, max {} bytes)",
141 html.len(),
142 MAX_HTML_SIZE
143 ),
144 ));
145 }
146 Ok(html.to_string())
147 } else {
148 Err(invalid_args(
149 "readability",
150 "missing 'hash' or 'html' parameter",
151 ))
152 }
153}
154
#[cfg(test)]
mod tests {
    use super::*;
    use crate::CasStore;
    use std::sync::Arc;

    /// Sample page: an `<article>` with several paragraphs, wrapped in
    /// `<nav>` and `<footer>` elements that extraction is expected to drop.
    const ARTICLE_HTML: &str = r#"
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <title>The Future of Rust - A Deep Dive</title>
        <meta name="author" content="Alice Smith">
        <meta name="description" content="An in-depth look at Rust's future">
    </head>
    <body>
        <nav>
            <a href="/">Home</a>
            <a href="/blog">Blog</a>
        </nav>
        <article>
            <h1>The Future of Rust</h1>
            <p>Rust has become one of the most loved programming languages in the world.
            Its focus on safety, performance, and concurrency makes it ideal for systems
            programming, web development, and more. In this article, we explore what
            the future holds for the Rust ecosystem.</p>
            <p>The Rust community has been growing steadily. With the introduction of
            async/await, the language has become more accessible for network programming.
            The borrow checker, once seen as a barrier, is now appreciated as a powerful
            tool for preventing bugs at compile time.</p>
            <p>Looking ahead, improvements to compile times, better IDE support, and
            expanding the standard library are key priorities. The Rust Foundation
            continues to invest in the language's infrastructure and community.</p>
            <p>Many companies including Mozilla, Microsoft, Google, and Amazon are now
            using Rust in production. The language's adoption in safety-critical systems,
            embedded development, and WebAssembly is accelerating.</p>
            <p>In conclusion, Rust's future looks bright. The combination of performance,
            safety, and a thriving community ensures that Rust will continue to grow
            and evolve for years to come.</p>
        </article>
        <footer>
            <p>Copyright 2026 Example Corp</p>
            <a href="/privacy">Privacy Policy</a>
        </footer>
    </body>
    </html>
    "#;

    /// Builds a tool context over a CAS store rooted in a fresh temp directory.
    ///
    /// The `TempDir` is returned so the caller keeps the directory alive for
    /// the duration of the test.
    async fn setup() -> (tempfile::TempDir, Arc<MediaToolContext>) {
        let tmp = tempfile::tempdir().unwrap();
        let store = CasStore::new(tmp.path());
        let context = MediaToolContext::new(store).unwrap();
        (tmp, Arc::new(context))
    }

    /// Returns the payload of a `Metadata` result; panics on any other variant.
    fn metadata(result: MediaOpResult) -> serde_json::Value {
        if let MediaOpResult::Metadata(payload) = result {
            payload
        } else {
            panic!("expected Metadata result")
        }
    }

    /// Executes the op with `args`, requiring success and a `Metadata` result.
    async fn run(ctx: &MediaToolContext, args: serde_json::Value) -> serde_json::Value {
        let op = ReadabilityOp;
        let outcome = op.execute(args, ctx).await;
        metadata(outcome.unwrap())
    }

    #[tokio::test]
    async fn extract_article_content() {
        let (_dir, ctx) = setup().await;
        let meta = run(&ctx, serde_json::json!({"html": ARTICLE_HTML})).await;

        let text = meta["text_content"].as_str().unwrap();
        assert!(text.contains("Rust"), "should extract article text: {text}");
        assert!(
            meta["char_count"].as_u64().unwrap() > 100,
            "should have substantial content"
        );
    }

    #[tokio::test]
    async fn extract_title() {
        let (_dir, ctx) = setup().await;
        let meta = run(&ctx, serde_json::json!({"html": ARTICLE_HTML})).await;

        let title = meta["title"].as_str().unwrap();
        assert!(
            title.contains("Rust"),
            "should extract article title: {title}"
        );
    }

    #[tokio::test]
    async fn strips_navigation() {
        let (_dir, ctx) = setup().await;
        let meta = run(&ctx, serde_json::json!({"html": ARTICLE_HTML})).await;

        let content = meta["content"].as_str().unwrap();
        assert!(
            !content.contains("Privacy Policy"),
            "should strip footer: {content}"
        );
    }

    #[tokio::test]
    async fn extract_from_cas_hash() {
        let (_dir, ctx) = setup().await;
        let stored = ctx.cas.store(ARTICLE_HTML.as_bytes()).await.unwrap();

        let meta = run(&ctx, serde_json::json!({"hash": stored.hash})).await;

        assert!(meta["char_count"].as_u64().unwrap() > 0);
    }

    #[tokio::test]
    async fn extract_with_url() {
        let (_dir, ctx) = setup().await;
        let args = serde_json::json!({
            "html": ARTICLE_HTML,
            "url": "https://example.com/article"
        });

        let meta = run(&ctx, args).await;

        assert!(
            meta["char_count"].as_u64().unwrap() > 0,
            "should extract content with URL context"
        );
    }

    #[tokio::test]
    async fn extract_empty_html() {
        let (_dir, ctx) = setup().await;
        let meta = run(&ctx, serde_json::json!({"html": ""})).await;

        assert_eq!(meta["char_count"], 0);
        assert_eq!(meta["content"], "");
    }

    #[tokio::test]
    async fn extract_missing_params() {
        let (_dir, ctx) = setup().await;
        let op = ReadabilityOp;

        let outcome = op.execute(serde_json::json!({}), &ctx).await;

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("NIKA-294"));
    }

    #[tokio::test]
    async fn extract_cancelled() {
        let (_dir, ctx) = setup().await;
        // Cancel before the op starts so its first cancellation check trips.
        ctx.cancel.cancel();
        let op = ReadabilityOp;

        let outcome = op
            .execute(serde_json::json!({"html": ARTICLE_HTML}), &ctx)
            .await;

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("cancelled"));
    }
}