Skip to main content

essence/engines/
detection.rs

1use scraper::{Html, Selector};
2
/// JavaScript rendering detection with framework signature detection.
///
/// This is a stateless unit type: every heuristic is exposed as an associated
/// function (for example `RenderingDetector::needs_javascript`), so no
/// instance is ever constructed.
pub struct RenderingDetector;
5
/// Detection result with the reasoning that led to it.
#[derive(Debug, Clone)]
pub struct DetectionResult {
    /// Whether JavaScript rendering is needed, i.e. at least one heuristic fired.
    pub needs_js: bool,
    /// Human-readable reason for the decision: the triggered heuristics joined
    /// with `"; "`, or a fixed "static content" message when none fired.
    pub reason: String,
    /// Names of the frameworks whose signatures matched (e.g. `"React"`);
    /// empty when no framework was recognised.
    pub detected_frameworks: Vec<String>,
    /// Content-to-script ratio, computed as `content / (content + script)`.
    /// It is `1.0` when the page contains no scripts.
    pub content_script_ratio: f64,
}
18
/// Signature used to recognise a single front-end framework.
///
/// A framework counts as detected when any HTML marker or any script pattern
/// matches.
#[derive(Debug)]
struct FrameworkSignature {
    /// Display name reported in `DetectionResult::detected_frameworks`.
    name: &'static str,
    /// Substrings searched for, case-sensitively, anywhere in the raw HTML.
    html_markers: Vec<&'static str>,
    /// Substrings searched for in the lowercased `src` attribute of `<script>`
    /// elements, so patterns are expected to be lowercase.
    script_patterns: Vec<&'static str>,
}
26
27impl RenderingDetector {
28    /// Check if JavaScript rendering is needed
29    pub fn needs_javascript(html: &str, _url: &str) -> DetectionResult {
30        let document = Html::parse_document(html);
31        let mut detected_frameworks = Vec::new();
32        let mut reasons = Vec::new();
33
34        // Check for framework signatures
35        let frameworks = Self::get_framework_signatures();
36        for framework in frameworks {
37            if Self::detect_framework(&document, html, &framework) {
38                detected_frameworks.push(framework.name.to_string());
39                reasons.push(format!("{} framework detected", framework.name));
40            }
41        }
42
43        // Check for lazy loading indicators
44        if Self::has_lazy_loading_indicators(&document, html) {
45            reasons.push("Lazy loading detected".to_string());
46        }
47
48        // Check for SPA routing
49        if Self::has_spa_routing(&document, html) {
50            reasons.push("SPA routing detected".to_string());
51        }
52
53        // Calculate content-to-script ratio
54        let content_script_ratio = Self::calculate_content_script_ratio(&document, html);
55        if content_script_ratio < 0.5 {
56            reasons.push(format!(
57                "Low content-to-script ratio: {:.2}",
58                content_script_ratio
59            ));
60        }
61
62        // Check for minimal content (SPA shell)
63        if Self::has_minimal_content(&document) {
64            reasons.push("Minimal initial content (SPA shell)".to_string());
65        }
66
67        // Check for hydration markers
68        if Self::has_hydration_markers(html) {
69            reasons.push("Hydration markers detected".to_string());
70        }
71
72        let needs_js = !reasons.is_empty() || !detected_frameworks.is_empty();
73        let reason = if needs_js {
74            reasons.join("; ")
75        } else {
76            "Static content with sufficient initial HTML".to_string()
77        };
78
79        DetectionResult {
80            needs_js,
81            reason,
82            detected_frameworks,
83            content_script_ratio,
84        }
85    }
86
87    /// Get framework signatures
88    fn get_framework_signatures() -> Vec<FrameworkSignature> {
89        vec![
90            FrameworkSignature {
91                name: "React",
92                html_markers: vec![
93                    "__REACT_DEVTOOLS_GLOBAL_HOOK__",
94                    "data-reactroot",
95                    "data-react-helmet",
96                    "react-root",
97                ],
98                script_patterns: vec!["react", "react-dom"],
99            },
100            FrameworkSignature {
101                name: "Next.js",
102                html_markers: vec!["__NEXT_DATA__", "_N_E", "__next"],
103                script_patterns: vec!["_next/static", "next/dist"],
104            },
105            FrameworkSignature {
106                name: "Vue",
107                html_markers: vec!["data-v-", "__VUE__", "data-server-rendered"],
108                script_patterns: vec!["vue.js", "vue.runtime"],
109            },
110            FrameworkSignature {
111                name: "Nuxt",
112                html_markers: vec!["__NUXT__", "$nuxt", "nuxt-link"],
113                script_patterns: vec!["_nuxt/"],
114            },
115            FrameworkSignature {
116                name: "Angular",
117                html_markers: vec!["ng-version", "_nghost", "_ngcontent"],
118                script_patterns: vec!["angular", "@angular"],
119            },
120            FrameworkSignature {
121                name: "Svelte",
122                html_markers: vec!["svelte-"],
123                script_patterns: vec!["svelte"],
124            },
125            FrameworkSignature {
126                name: "Gatsby",
127                html_markers: vec!["___gatsby", "gatsby-"],
128                script_patterns: vec!["webpack-runtime"],
129            },
130            FrameworkSignature {
131                name: "Ember",
132                html_markers: vec!["ember-application", "ember-view"],
133                script_patterns: vec!["ember.js"],
134            },
135        ]
136    }
137
138    /// Detect if a specific framework is present
139    fn detect_framework(document: &Html, html: &str, signature: &FrameworkSignature) -> bool {
140        // Check HTML markers
141        for marker in &signature.html_markers {
142            if html.contains(marker) {
143                return true;
144            }
145        }
146
147        // Check script src patterns
148        if let Ok(selector) = Selector::parse("script[src]") {
149            for element in document.select(&selector) {
150                if let Some(src) = element.value().attr("src") {
151                    for pattern in &signature.script_patterns {
152                        if src.to_lowercase().contains(pattern) {
153                            return true;
154                        }
155                    }
156                }
157            }
158        }
159
160        false
161    }
162
163    /// Check for lazy loading indicators
164    fn has_lazy_loading_indicators(document: &Html, html: &str) -> bool {
165        // Check for lazy loading attributes
166        let lazy_patterns = vec![
167            "data-lazy",
168            "data-src",
169            "loading=\"lazy\"",
170            "loading='lazy'",
171            "lazy-load",
172            "data-original",
173            "data-lazy-src",
174        ];
175
176        for pattern in lazy_patterns {
177            if html.contains(pattern) {
178                return true;
179            }
180        }
181
182        // Check for intersection observer (common lazy loading technique)
183        if let Ok(selector) = Selector::parse("script") {
184            for element in document.select(&selector) {
185                let script_text = element.text().collect::<String>();
186                if script_text.contains("IntersectionObserver")
187                    || script_text.contains("getBoundingClientRect")
188                {
189                    return true;
190                }
191            }
192        }
193
194        false
195    }
196
197    /// Check for SPA routing patterns
198    fn has_spa_routing(document: &Html, html: &str) -> bool {
199        // Check for client-side routing libraries
200        let routing_patterns = vec![
201            "react-router",
202            "vue-router",
203            "angular/router",
204            "@reach/router",
205            "history.pushState",
206            "history.replaceState",
207        ];
208
209        for pattern in routing_patterns {
210            if html.contains(pattern) {
211                return true;
212            }
213        }
214
215        // Check for hash-based routing
216        if let Ok(selector) = Selector::parse("a[href^='#/']") {
217            if document.select(&selector).count() > 0 {
218                return true;
219            }
220        }
221
222        false
223    }
224
225    /// Calculate content-to-script ratio
226    fn calculate_content_script_ratio(document: &Html, _html: &str) -> f64 {
227        // Get all script content
228        let mut script_size = 0;
229        if let Ok(selector) = Selector::parse("script") {
230            for element in document.select(&selector) {
231                let script_text = element.text().collect::<String>();
232                script_size += script_text.len();
233                
234                // Also count inline scripts from src length estimation
235                if let Some(src) = element.value().attr("src") {
236                    script_size += src.len() * 10; // Estimate external script impact
237                }
238            }
239        }
240
241        // Get body text content
242        let body_text = document
243            .root_element()
244            .text()
245            .collect::<String>()
246            .trim()
247            .to_string();
248        
249        let content_size = body_text.len();
250
251        if script_size == 0 {
252            return 1.0;
253        }
254
255        content_size as f64 / (content_size + script_size) as f64
256    }
257
258    /// Check for minimal content (SPA shell)
259    fn has_minimal_content(document: &Html) -> bool {
260        // Get visible text content
261        let body_text = document
262            .root_element()
263            .text()
264            .collect::<String>()
265            .trim()
266            .to_string();
267
268        // If body has very little text (< 100 chars), it's likely a SPA shell
269        if body_text.len() < 100 {
270            return true;
271        }
272
273        // Check for common SPA root elements with minimal content
274        let spa_roots = vec!["#root", "#app", "#__next", "#application"];
275        for root_id in spa_roots {
276            if let Ok(selector) = Selector::parse(root_id) {
277                if let Some(root) = document.select(&selector).next() {
278                    let root_text = root.text().collect::<String>().trim().to_string();
279                    if root_text.is_empty() || root_text.len() < 50 {
280                        return true;
281                    }
282                }
283            }
284        }
285
286        false
287    }
288
289    /// Check for hydration markers
290    fn has_hydration_markers(html: &str) -> bool {
291        let hydration_markers = vec![
292            "data-reactid",
293            "data-react-checksum",
294            "data-server-rendered",
295            "__NEXT_DATA__",
296            "__NUXT__",
297            "data-hydrate",
298        ];
299
300        for marker in hydration_markers {
301            if html.contains(marker) {
302                return true;
303            }
304        }
305
306        false
307    }
308
309    /// Get a detailed analysis report
310    pub fn analyze_page(html: &str, url: &str) -> String {
311        let result = Self::needs_javascript(html, url);
312        
313        let mut report = String::new();
314        report.push_str(&format!("URL: {}\n", url));
315        report.push_str(&format!("Needs JavaScript: {}\n", result.needs_js));
316        report.push_str(&format!("Reason: {}\n", result.reason));
317        report.push_str(&format!("Content/Script Ratio: {:.2}\n", result.content_script_ratio));
318        
319        if !result.detected_frameworks.is_empty() {
320            report.push_str(&format!(
321                "Detected Frameworks: {}\n",
322                result.detected_frameworks.join(", ")
323            ));
324        }
325        
326        report
327    }
328}
329
/// Unit tests for the rendering heuristics, each driven by a small inline
/// HTML fixture passed to `RenderingDetector::needs_javascript`.
#[cfg(test)]
mod tests {
    use super::*;

    /// The React devtools hook in an inline script marks the page as React.
    #[test]
    fn test_detect_react() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
                <div id="root"></div>
                <script>window.__REACT_DEVTOOLS_GLOBAL_HOOK__ = {}</script>
            </body>
            </html>
        "#;
        let result = RenderingDetector::needs_javascript(html, "https://example.com");
        assert!(result.needs_js);
        assert!(result.detected_frameworks.contains(&"React".to_string()));
    }

    /// The `__next` mount point and `__NEXT_DATA__` payload mark the page as Next.js.
    #[test]
    fn test_detect_nextjs() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
                <div id="__next"></div>
                <script id="__NEXT_DATA__" type="application/json">{}</script>
            </body>
            </html>
        "#;
        let result = RenderingDetector::needs_javascript(html, "https://example.com");
        assert!(result.needs_js);
        assert!(result.detected_frameworks.contains(&"Next.js".to_string()));
    }

    /// A scoped `data-v-` attribute and a `vue.runtime` script mark the page as Vue.
    #[test]
    fn test_detect_vue() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head></head>
            <body>
                <div id="app" data-v-123></div>
                <script src="/vue.runtime.js"></script>
            </body>
            </html>
        "#;
        let result = RenderingDetector::needs_javascript(html, "https://example.com");
        assert!(result.needs_js);
        assert!(result.detected_frameworks.contains(&"Vue".to_string()));
    }

    /// A `data-lazy-src` image attribute is reported as lazy loading.
    #[test]
    fn test_detect_lazy_loading() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <body>
                <img data-lazy-src="image.jpg" />
                <p>Some content here to make it substantial enough.</p>
            </body>
            </html>
        "#;
        let result = RenderingDetector::needs_javascript(html, "https://example.com");
        assert!(result.needs_js);
        assert!(result.reason.contains("Lazy loading"));
    }

    /// A script-free page with plenty of text triggers no heuristic.
    #[test]
    fn test_static_content() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head><title>Regular Page</title></head>
            <body>
                <h1>Welcome to Our Website</h1>
                <p>This is a regular HTML page with plenty of content that is not a SPA.</p>
                <p>It has multiple paragraphs and elements that provide substantial content.</p>
                <article>
                    <h2>Article Title</h2>
                    <p>Article content goes here with enough text to be considered substantial.</p>
                </article>
            </body>
            </html>
        "#;
        let result = RenderingDetector::needs_javascript(html, "https://example.com");
        assert!(!result.needs_js);
        assert!(result.reason.contains("Static content"));
    }

    /// An empty `#root` mount point with almost no text is reported as an SPA shell.
    #[test]
    fn test_minimal_content() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head><title>App</title></head>
            <body>
                <div id="root"></div>
            </body>
            </html>
        "#;
        let result = RenderingDetector::needs_javascript(html, "https://example.com");
        assert!(result.needs_js);
        assert!(result.reason.contains("Minimal initial content"));
    }

    /// A page dominated by inline and external scripts yields a reduced ratio.
    #[test]
    fn test_content_script_ratio() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <body>
                <p>A bit of content</p>
                <script>
                    // Lots of JavaScript code here
                    var x = 1; var y = 2; var z = 3;
                    function test() {
                        console.log("This is a long script to test ratio with lots of code");
                        console.log("More code here to make it substantial");
                        console.log("Even more code to increase the ratio");
                        console.log("And more JavaScript to ensure low content ratio");
                        console.log("Additional script content here");
                        console.log("Even more script content");
                        var longVariable = "This is a long string to add more script content";
                        var anotherVariable = "And another one for good measure";
                    }
                </script>
                <script src="https://example.com/very-long-path-to-external-script.js"></script>
                <script src="https://example.com/another-external-script-with-long-path.js"></script>
            </body>
            </html>
        "#;
        let result = RenderingDetector::needs_javascript(html, "https://example.com");
        // With external scripts, the ratio should be low
        assert!(result.content_script_ratio < 0.8); // More lenient assertion
    }
}
469}