bless_plugins/crawl/
mod.rs

1use anyhow::{anyhow, Result};
2use blockless_sdk::{BlessCrawl, CrawlOptions, MapOptions, ScrapeOptions, Viewport};
3use javy_plugin_api::javy::{
4    hold, hold_and_release,
5    quickjs::{prelude::MutFn, Function, Object, Value},
6    to_js_error, Args,
7};
8use std::collections::HashMap;
9use std::sync::Arc;
10
11/// Create a BlessCrawl instance from JavaScript
12pub fn bless_crawl(args: Args<'_>) -> Result<Value<'_>> {
13    let (cx, args) = args.release();
14
15    // optional config can be provided in constructor
16    // Wrap in Arc for sharing across closures
17    let scrape_config = if args.len() > 0 {
18        let config_obj = args[0]
19            .as_object()
20            .ok_or_else(|| anyhow!("config must be an object"))?;
21        Arc::new(Some(parse_options(config_obj)))
22    } else {
23        Arc::new(None)
24    };
25
26    // Create JavaScript object wrapper
27    let instance = Object::new(cx.clone())?;
28
29    // Clone Arc for the scrape closure
30    let scrape_config_clone = scrape_config.clone();
31    instance.set(
32        "scrape",
33        Function::new(
34            cx.clone(),
35            MutFn::new(move |cx, args| {
36                let (cx, args) = hold_and_release!(cx, args);
37
38                let scrape_fn = |args: Args<'_>| {
39                    let (_cx, args) = args.release();
40
41                    if args.is_empty() {
42                        return Err(anyhow!("URL is required"));
43                    }
44
45                    let url = args[0]
46                        .as_string()
47                        .ok_or_else(|| anyhow!("URL must be a string"))?
48                        .to_string()
49                        .map_err(|_| anyhow!("invalid UTF-8 in URL"))?;
50
51                    // Check if args[1] is provided, otherwise use scrape_config, otherwise error
52                    let scrape_options = if args.len() > 1 {
53                        let opts_obj = args[1]
54                            .as_object()
55                            .ok_or_else(|| anyhow!("options must be an object"))?;
56                        parse_options(opts_obj)
57                    } else if let Some(config) = scrape_config_clone.as_ref() {
58                        config.clone()
59                    } else {
60                        return Err(anyhow!(
61                            "No scrape options provided and no default config available"
62                        ));
63                    };
64
65                    let result = BlessCrawl::with_config(scrape_options)
66                        .map_err(|e| anyhow!("Failed to create BlessCrawl instance: {:?}", e))?
67                        .scrape(&url, None)
68                        .map_err(|e| anyhow!("Scrape failed: {:?}", e))?;
69
70                    // Serialize the result to JSON and parse it as a JavaScript value
71                    let json_str = serde_json::to_string(&result)
72                        .map_err(|e| anyhow!("Failed to serialize result: {:?}", e))?;
73                    let js_code = format!("({})", json_str);
74                    let js_value = cx.eval::<Value, _>(js_code.as_bytes()).map_err(|e| {
75                        anyhow!("Failed to parse JSON as JavaScript value: {:?}", e)
76                    })?;
77
78                    Ok(js_value)
79                };
80
81                scrape_fn(hold!(cx.clone(), args)).map_err(|e| to_js_error(cx, e))
82            }),
83        ),
84    )?;
85
86    // Add map method
87    let scrape_config_clone = scrape_config.clone();
88    instance.set(
89        "map",
90        Function::new(
91            cx.clone(),
92            MutFn::new(move |cx, args| {
93                let (cx, args) = hold_and_release!(cx, args);
94
95                let map_fn = |args: Args<'_>| {
96                    let (_cx, args) = args.release();
97
98                    if args.is_empty() {
99                        return Err(anyhow!("URL is required"));
100                    }
101
102                    let url = args[0]
103                        .as_string()
104                        .ok_or_else(|| anyhow!("URL must be a string"))?
105                        .to_string()
106                        .map_err(|_| anyhow!("invalid UTF-8 in URL"))?;
107
108                    let (scrape_options, map_options) = if args.len() > 1 {
109                        let opts_obj = args[1]
110                            .as_object()
111                            .ok_or_else(|| anyhow!("options must be an object"))?;
112                        (parse_options(opts_obj), parse_map_options(opts_obj))
113                    } else if let Some(config) = scrape_config_clone.as_ref() {
114                        (config.clone(), MapOptions::default())
115                    } else {
116                        return Err(anyhow!(
117                            "No scrape options provided and no default config available"
118                        ));
119                    };
120
121                    let result = BlessCrawl::with_config(scrape_options)
122                        .map_err(|e| anyhow!("Failed to create BlessCrawl instance: {:?}", e))?
123                        .map(&url, Some(map_options))
124                        .map_err(|e| anyhow!("Map failed: {:?}", e))?;
125
126                    let json_str = serde_json::to_string(&result)
127                        .map_err(|e| anyhow!("Failed to serialize result: {:?}", e))?;
128
129                    let js_code = format!("({})", json_str);
130                    let js_value = cx.eval::<Value, _>(js_code.as_bytes()).map_err(|e| {
131                        anyhow!("Failed to parse JSON as JavaScript value: {:?}", e)
132                    })?;
133
134                    Ok(js_value)
135                };
136
137                map_fn(hold!(cx.clone(), args)).map_err(|e| to_js_error(cx, e))
138            }),
139        ),
140    )?;
141
142    // Add crawl method
143    let scrape_config_clone = scrape_config.clone();
144    instance.set(
145        "crawl",
146        Function::new(
147            cx.clone(),
148            MutFn::new(move |cx, args| {
149                let (cx, args) = hold_and_release!(cx, args);
150
151                let crawl_fn = |args: Args<'_>| {
152                    let (_cx, args) = args.release();
153
154                    if args.is_empty() {
155                        return Err(anyhow!("URL is required"));
156                    }
157
158                    let url = args[0]
159                        .as_string()
160                        .ok_or_else(|| anyhow!("URL must be a string"))?
161                        .to_string()
162                        .map_err(|_| anyhow!("invalid UTF-8 in URL"))?;
163
164                    let (scrape_options, crawl_options) = if args.len() > 1 {
165                        let opts_obj = args[1]
166                            .as_object()
167                            .ok_or_else(|| anyhow!("options must be an object"))?;
168                        (parse_options(opts_obj), parse_crawl_options(opts_obj))
169                    } else if let Some(config) = scrape_config_clone.as_ref() {
170                        (config.clone(), CrawlOptions::default())
171                    } else {
172                        return Err(anyhow!(
173                            "No scrape options provided and no default config available"
174                        ));
175                    };
176
177                    let result = BlessCrawl::with_config(scrape_options)
178                        .map_err(|e| anyhow!("Failed to create BlessCrawl instance: {:?}", e))?
179                        .crawl(&url, Some(crawl_options))
180                        .map_err(|e| anyhow!("Crawl failed: {:?}", e))?;
181
182                    let json_str = serde_json::to_string(&result)
183                        .map_err(|e| anyhow!("Failed to serialize result: {:?}", e))?;
184
185                    let js_code = format!("({})", json_str);
186                    let js_value = cx.eval::<Value, _>(js_code.as_bytes()).map_err(|e| {
187                        anyhow!("Failed to parse JSON as JavaScript value: {:?}", e)
188                    })?;
189
190                    Ok(js_value)
191                };
192
193                crawl_fn(hold!(cx.clone(), args)).map_err(|e| to_js_error(cx, e))
194            }),
195        ),
196    )?;
197
198    Ok(Value::from_object(instance))
199}
200
201/// Parse JavaScript object into ScrapeOptions
202fn parse_options(obj: &Object) -> ScrapeOptions {
203    // Parse all scrape options from the object
204    let mut scrape_options = ScrapeOptions::default();
205    if let Ok(Some(timeout)) = obj.get::<_, Option<f64>>("timeout") {
206        scrape_options.timeout = timeout as u32;
207    }
208    if let Ok(Some(wait_time)) = obj.get::<_, Option<f64>>("wait_time") {
209        scrape_options.wait_time = wait_time as u32;
210    }
211    if let Ok(Some(include_tags)) = obj.get::<_, Option<Vec<String>>>("include_tags") {
212        scrape_options.include_tags = Some(include_tags);
213    }
214    if let Ok(Some(exclude_tags)) = obj.get::<_, Option<Vec<String>>>("exclude_tags") {
215        scrape_options.exclude_tags = Some(exclude_tags);
216    }
217    if let Ok(Some(only_main_content)) = obj.get::<_, Option<bool>>("only_main_content") {
218        scrape_options.only_main_content = only_main_content;
219    }
220    if let Ok(Some(format)) = obj.get::<_, Option<String>>("format") {
221        scrape_options.format = format.parse().unwrap_or_default();
222    }
223    if let Ok(Some(viewport_obj)) = obj.get::<_, Option<Object>>("viewport") {
224        let mut viewport = Viewport::default();
225        if let Ok(Some(width)) = viewport_obj.get::<_, Option<u32>>("width") {
226            viewport.width = Some(width);
227        }
228        if let Ok(Some(height)) = viewport_obj.get::<_, Option<u32>>("height") {
229            viewport.height = Some(height);
230        }
231        scrape_options.viewport = Some(viewport);
232    }
233    if let Ok(Some(user_agent)) = obj.get::<_, Option<String>>("user_agent") {
234        scrape_options.user_agent = Some(user_agent);
235    }
236    if let Ok(Some(headers_obj)) = obj.get::<_, Option<HashMap<String, String>>>("headers") {
237        scrape_options.headers = Some(headers_obj);
238    }
239    scrape_options
240}
241
242/// Parse JavaScript object into MapOptions
243fn parse_map_options(obj: &Object) -> MapOptions {
244    let mut options = MapOptions::default();
245    if let Ok(Some(base_url)) = obj.get::<_, Option<String>>("base_url") {
246        options.base_url = Some(base_url);
247    }
248    if let Ok(Some(link_types)) = obj.get::<_, Option<Vec<String>>>("link_types") {
249        options.link_types = Some(link_types);
250    }
251    if let Ok(Some(filter_extensions)) = obj.get::<_, Option<Vec<String>>>("filter_extensions") {
252        options.filter_extensions = Some(filter_extensions);
253    }
254    options
255}
256
257/// Parse JavaScript object into CrawlOptions
258fn parse_crawl_options(obj: &Object) -> CrawlOptions {
259    let mut options = CrawlOptions::default();
260    if let Ok(Some(limit)) = obj.get::<_, Option<f64>>("limit") {
261        options.limit = Some(limit as u32);
262    }
263    if let Ok(Some(max_depth)) = obj.get::<_, Option<u8>>("max_depth") {
264        options.max_depth = Some(max_depth);
265    }
266    if let Ok(Some(exclude_paths)) = obj.get::<_, Option<Vec<String>>>("exclude_paths") {
267        options.exclude_paths = Some(exclude_paths);
268    }
269    if let Ok(Some(include_paths)) = obj.get::<_, Option<Vec<String>>>("include_paths") {
270        options.include_paths = Some(include_paths);
271    }
272    if let Ok(Some(follow_external)) = obj.get::<_, Option<bool>>("follow_external") {
273        options.follow_external = Some(follow_external);
274    }
275    if let Ok(Some(delay)) = obj.get::<_, Option<u32>>("delay_between_requests") {
276        options.delay_between_requests = Some(delay);
277    }
278    if let Ok(Some(parallel)) = obj.get::<_, Option<u32>>("parallel_requests") {
279        options.parallel_requests = Some(parallel);
280    }
281    options
282}