1use scraper::{Html, Selector};
2
/// Stateless namespace for the heuristics that decide whether a page's HTML
/// needs a JavaScript runtime before its content is available.
///
/// The type has no fields; all functionality lives in associated functions
/// such as `RenderingDetector::needs_javascript`.
#[derive(Debug, Clone, Copy, Default)]
pub struct RenderingDetector;
5
/// Outcome of running the rendering heuristics on one HTML document.
///
/// Derives `PartialEq` so results can be compared directly (for example in
/// tests); `Eq` is not available because of the `f64` field.
#[derive(Debug, Clone, PartialEq)]
pub struct DetectionResult {
    /// `true` when at least one heuristic fired.
    pub needs_js: bool,
    /// The messages of every heuristic that fired, joined with `"; "`, or a
    /// fixed "static content" message when none did.
    pub reason: String,
    /// Names of the frameworks whose signatures matched.
    pub detected_frameworks: Vec<String>,
    /// Content bytes divided by content-plus-script bytes, as computed by the
    /// detector; `1.0` when the page contains no script bytes.
    pub content_script_ratio: f64,
}
18
/// Fingerprint used to recognise one front-end framework in a page.
#[derive(Debug)]
struct FrameworkSignature {
    /// Human-readable framework name reported to callers.
    name: &'static str,
    /// Substrings searched for, case-sensitively, in the raw HTML.
    html_markers: Vec<&'static str>,
    /// Substrings searched for in lower-cased `<script src>` URLs.
    script_patterns: Vec<&'static str>,
}
26
27impl RenderingDetector {
28 pub fn needs_javascript(html: &str, _url: &str) -> DetectionResult {
30 let document = Html::parse_document(html);
31 let mut detected_frameworks = Vec::new();
32 let mut reasons = Vec::new();
33
34 let frameworks = Self::get_framework_signatures();
36 for framework in frameworks {
37 if Self::detect_framework(&document, html, &framework) {
38 detected_frameworks.push(framework.name.to_string());
39 reasons.push(format!("{} framework detected", framework.name));
40 }
41 }
42
43 if Self::has_lazy_loading_indicators(&document, html) {
45 reasons.push("Lazy loading detected".to_string());
46 }
47
48 if Self::has_spa_routing(&document, html) {
50 reasons.push("SPA routing detected".to_string());
51 }
52
53 let content_script_ratio = Self::calculate_content_script_ratio(&document, html);
55 if content_script_ratio < 0.5 {
56 reasons.push(format!(
57 "Low content-to-script ratio: {:.2}",
58 content_script_ratio
59 ));
60 }
61
62 if Self::has_minimal_content(&document) {
64 reasons.push("Minimal initial content (SPA shell)".to_string());
65 }
66
67 if Self::has_hydration_markers(html) {
69 reasons.push("Hydration markers detected".to_string());
70 }
71
72 let needs_js = !reasons.is_empty() || !detected_frameworks.is_empty();
73 let reason = if needs_js {
74 reasons.join("; ")
75 } else {
76 "Static content with sufficient initial HTML".to_string()
77 };
78
79 DetectionResult {
80 needs_js,
81 reason,
82 detected_frameworks,
83 content_script_ratio,
84 }
85 }
86
87 fn get_framework_signatures() -> Vec<FrameworkSignature> {
89 vec![
90 FrameworkSignature {
91 name: "React",
92 html_markers: vec![
93 "__REACT_DEVTOOLS_GLOBAL_HOOK__",
94 "data-reactroot",
95 "data-react-helmet",
96 "react-root",
97 ],
98 script_patterns: vec!["react", "react-dom"],
99 },
100 FrameworkSignature {
101 name: "Next.js",
102 html_markers: vec!["__NEXT_DATA__", "_N_E", "__next"],
103 script_patterns: vec!["_next/static", "next/dist"],
104 },
105 FrameworkSignature {
106 name: "Vue",
107 html_markers: vec!["data-v-", "__VUE__", "data-server-rendered"],
108 script_patterns: vec!["vue.js", "vue.runtime"],
109 },
110 FrameworkSignature {
111 name: "Nuxt",
112 html_markers: vec!["__NUXT__", "$nuxt", "nuxt-link"],
113 script_patterns: vec!["_nuxt/"],
114 },
115 FrameworkSignature {
116 name: "Angular",
117 html_markers: vec!["ng-version", "_nghost", "_ngcontent"],
118 script_patterns: vec!["angular", "@angular"],
119 },
120 FrameworkSignature {
121 name: "Svelte",
122 html_markers: vec!["svelte-"],
123 script_patterns: vec!["svelte"],
124 },
125 FrameworkSignature {
126 name: "Gatsby",
127 html_markers: vec!["___gatsby", "gatsby-"],
128 script_patterns: vec!["webpack-runtime"],
129 },
130 FrameworkSignature {
131 name: "Ember",
132 html_markers: vec!["ember-application", "ember-view"],
133 script_patterns: vec!["ember.js"],
134 },
135 ]
136 }
137
138 fn detect_framework(document: &Html, html: &str, signature: &FrameworkSignature) -> bool {
140 for marker in &signature.html_markers {
142 if html.contains(marker) {
143 return true;
144 }
145 }
146
147 if let Ok(selector) = Selector::parse("script[src]") {
149 for element in document.select(&selector) {
150 if let Some(src) = element.value().attr("src") {
151 for pattern in &signature.script_patterns {
152 if src.to_lowercase().contains(pattern) {
153 return true;
154 }
155 }
156 }
157 }
158 }
159
160 false
161 }
162
163 fn has_lazy_loading_indicators(document: &Html, html: &str) -> bool {
165 let lazy_patterns = vec![
167 "data-lazy",
168 "data-src",
169 "loading=\"lazy\"",
170 "loading='lazy'",
171 "lazy-load",
172 "data-original",
173 "data-lazy-src",
174 ];
175
176 for pattern in lazy_patterns {
177 if html.contains(pattern) {
178 return true;
179 }
180 }
181
182 if let Ok(selector) = Selector::parse("script") {
184 for element in document.select(&selector) {
185 let script_text = element.text().collect::<String>();
186 if script_text.contains("IntersectionObserver")
187 || script_text.contains("getBoundingClientRect")
188 {
189 return true;
190 }
191 }
192 }
193
194 false
195 }
196
197 fn has_spa_routing(document: &Html, html: &str) -> bool {
199 let routing_patterns = vec![
201 "react-router",
202 "vue-router",
203 "angular/router",
204 "@reach/router",
205 "history.pushState",
206 "history.replaceState",
207 ];
208
209 for pattern in routing_patterns {
210 if html.contains(pattern) {
211 return true;
212 }
213 }
214
215 if let Ok(selector) = Selector::parse("a[href^='#/']") {
217 if document.select(&selector).count() > 0 {
218 return true;
219 }
220 }
221
222 false
223 }
224
225 fn calculate_content_script_ratio(document: &Html, _html: &str) -> f64 {
227 let mut script_size = 0;
229 if let Ok(selector) = Selector::parse("script") {
230 for element in document.select(&selector) {
231 let script_text = element.text().collect::<String>();
232 script_size += script_text.len();
233
234 if let Some(src) = element.value().attr("src") {
236 script_size += src.len() * 10; }
238 }
239 }
240
241 let body_text = document
243 .root_element()
244 .text()
245 .collect::<String>()
246 .trim()
247 .to_string();
248
249 let content_size = body_text.len();
250
251 if script_size == 0 {
252 return 1.0;
253 }
254
255 content_size as f64 / (content_size + script_size) as f64
256 }
257
258 fn has_minimal_content(document: &Html) -> bool {
260 let body_text = document
262 .root_element()
263 .text()
264 .collect::<String>()
265 .trim()
266 .to_string();
267
268 if body_text.len() < 100 {
270 return true;
271 }
272
273 let spa_roots = vec!["#root", "#app", "#__next", "#application"];
275 for root_id in spa_roots {
276 if let Ok(selector) = Selector::parse(root_id) {
277 if let Some(root) = document.select(&selector).next() {
278 let root_text = root.text().collect::<String>().trim().to_string();
279 if root_text.is_empty() || root_text.len() < 50 {
280 return true;
281 }
282 }
283 }
284 }
285
286 false
287 }
288
289 fn has_hydration_markers(html: &str) -> bool {
291 let hydration_markers = vec![
292 "data-reactid",
293 "data-react-checksum",
294 "data-server-rendered",
295 "__NEXT_DATA__",
296 "__NUXT__",
297 "data-hydrate",
298 ];
299
300 for marker in hydration_markers {
301 if html.contains(marker) {
302 return true;
303 }
304 }
305
306 false
307 }
308
309 pub fn analyze_page(html: &str, url: &str) -> String {
311 let result = Self::needs_javascript(html, url);
312
313 let mut report = String::new();
314 report.push_str(&format!("URL: {}\n", url));
315 report.push_str(&format!("Needs JavaScript: {}\n", result.needs_js));
316 report.push_str(&format!("Reason: {}\n", result.reason));
317 report.push_str(&format!("Content/Script Ratio: {:.2}\n", result.content_script_ratio));
318
319 if !result.detected_frameworks.is_empty() {
320 report.push_str(&format!(
321 "Detected Frameworks: {}\n",
322 result.detected_frameworks.join(", ")
323 ));
324 }
325
326 report
327 }
328}
329
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the detector on `html` with a placeholder URL.
    fn detect(html: &str) -> DetectionResult {
        RenderingDetector::needs_javascript(html, "https://example.com")
    }

    #[test]
    fn test_detect_react() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
                <div id="root"></div>
                <script>window.__REACT_DEVTOOLS_GLOBAL_HOOK__ = {}</script>
            </body>
            </html>
        "#;
        let result = detect(html);
        assert!(result.needs_js);
        assert!(result.detected_frameworks.contains(&"React".to_string()));
    }

    #[test]
    fn test_detect_nextjs() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
                <div id="__next"></div>
                <script id="__NEXT_DATA__" type="application/json">{}</script>
            </body>
            </html>
        "#;
        let result = detect(html);
        assert!(result.needs_js);
        assert!(result.detected_frameworks.contains(&"Next.js".to_string()));
    }

    #[test]
    fn test_detect_vue() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
                <div id="app" data-v-123></div>
                <script src="/vue.runtime.js"></script>
            </body>
            </html>
        "#;
        let result = detect(html);
        assert!(result.needs_js);
        assert!(result.detected_frameworks.contains(&"Vue".to_string()));
    }

    #[test]
    fn test_detect_lazy_loading() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <body>
                <img data-lazy-src="image.jpg" />
                <p>Some content here to make it substantial enough.</p>
            </body>
            </html>
        "#;
        let result = detect(html);
        assert!(result.needs_js);
        assert!(result.reason.contains("Lazy loading"));
    }

    #[test]
    fn test_static_content() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head><title>Regular Page</title></head>
            <body>
                <h1>Welcome to Our Website</h1>
                <p>This is a regular HTML page with plenty of content that is not a SPA.</p>
                <p>It has multiple paragraphs and elements that provide substantial content.</p>
                <article>
                    <h2>Article Title</h2>
                    <p>Article content goes here with enough text to be considered substantial.</p>
                </article>
            </body>
            </html>
        "#;
        let result = detect(html);
        assert!(!result.needs_js);
        assert!(result.reason.contains("Static content"));
        assert!(result.detected_frameworks.is_empty());
    }

    #[test]
    fn test_minimal_content() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head><title>App</title></head>
            <body>
                <div id="root"></div>
            </body>
            </html>
        "#;
        let result = detect(html);
        assert!(result.needs_js);
        assert!(result.reason.contains("Minimal initial content"));
    }

    #[test]
    fn test_content_script_ratio() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <body>
                <p>A bit of content</p>
                <script>
                    // Lots of JavaScript code here
                    var x = 1; var y = 2; var z = 3;
                    function test() {
                        console.log("This is a long script to test ratio with lots of code");
                        console.log("More code here to make it substantial");
                        console.log("Even more code to increase the ratio");
                        console.log("And more JavaScript to ensure low content ratio");
                        console.log("Additional script content here");
                        console.log("Even more script content");
                        var longVariable = "This is a long string to add more script content";
                        var anotherVariable = "And another one for good measure";
                    }
                </script>
                <script src="https://example.com/very-long-path-to-external-script.js"></script>
                <script src="https://example.com/another-external-script-with-long-path.js"></script>
            </body>
            </html>
        "#;
        let result = detect(html);
        // 0.5 is the threshold below which the detector reports a low ratio,
        // so assert against it rather than a looser bound.
        assert!(result.content_script_ratio < 0.5);
        assert!(result.needs_js);
        assert!(result.reason.contains("Low content-to-script ratio"));
    }
}