halldyll_core/render/
decision.rs

1//! Decision - Decision to switch to a headless browser
2
3use scraper::{Html, Selector};
4
5/// Render decision
6#[derive(Debug, Clone, PartialEq, Eq)]
7pub enum RenderDecision {
8    /// Static HTML is sufficient
9    Static,
10    /// Requires a headless browser
11    NeedsBrowser(BrowserReason),
12}
13
/// Why a page was judged to need a headless browser.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum BrowserReason {
    /// The page body carries little or no text.
    EmptyContent,
    /// The page references a large number of scripts.
    HeavyScripts,
    /// Markers of a JavaScript framework were found; the payload is the
    /// framework's display name (for example `"React"`, `"Vue"` or `"Angular"`).
    JsFramework(String),
    /// Content is lazy-loaded in a way that static HTML cannot resolve.
    LazyLoading,
    /// None of the selectors the caller requires could be found.
    MissingSelectors,
    /// The page looks like a single-page application shell.
    SinglePageApp,
}
30
/// Heuristic checker that decides whether a page's static HTML is usable or
/// whether the page has to be rendered in a headless browser.
///
/// Build one with `RenderChecker::new()` (or `Default::default()`) and tune it
/// through the `with_*` builder methods.
#[derive(Debug, Clone)]
pub struct RenderChecker {
    /// Number of external scripts a page may reference before it is
    /// considered script-heavy.
    script_threshold: usize,
    /// Minimum amount of body text, in characters, a page must carry to be
    /// considered non-empty.
    min_content_length: usize,
    /// CSS selectors the caller expects to find; a browser is requested only
    /// when every one of them is absent. An empty list disables the check.
    required_selectors: Vec<String>,
}

impl Default for RenderChecker {
    /// Returns a checker with a script threshold of 10, a minimum content
    /// length of 500 and no required selectors.
    fn default() -> Self {
        const DEFAULT_SCRIPT_THRESHOLD: usize = 10;
        const DEFAULT_MIN_CONTENT_LENGTH: usize = 500;

        Self {
            script_threshold: DEFAULT_SCRIPT_THRESHOLD,
            min_content_length: DEFAULT_MIN_CONTENT_LENGTH,
            required_selectors: Vec::new(),
        }
    }
}
50
51impl RenderChecker {
52    /// New checker
53    pub fn new() -> Self {
54        Self::default()
55    }
56
57    /// Configure the script threshold
58    pub fn with_script_threshold(mut self, threshold: usize) -> Self {
59        self.script_threshold = threshold;
60        self
61    }
62
63    /// Configure the minimum content
64    pub fn with_min_content(mut self, min_length: usize) -> Self {
65        self.min_content_length = min_length;
66        self
67    }
68
69    /// Configure the required selectors
70    pub fn with_required_selectors(mut self, selectors: Vec<String>) -> Self {
71        self.required_selectors = selectors;
72        self
73    }
74
75    /// Check if JS rendering is necessary
76    pub fn check(&self, html: &str) -> RenderDecision {
77        let document = Html::parse_document(html);
78
79        // 1. Check if page is empty
80        if let Some(reason) = self.check_empty_content(&document) {
81            return RenderDecision::NeedsBrowser(reason);
82        }
83
84        // 2. Check JS frameworks
85        if let Some(reason) = self.check_js_frameworks(html) {
86            return RenderDecision::NeedsBrowser(reason);
87        }
88
89        // 3. Check number of scripts
90        if let Some(reason) = self.check_heavy_scripts(&document) {
91            return RenderDecision::NeedsBrowser(reason);
92        }
93
94        // 4. Check required selectors
95        if let Some(reason) = self.check_required_selectors(&document) {
96            return RenderDecision::NeedsBrowser(reason);
97        }
98
99        // 5. Check SPA patterns
100        if let Some(reason) = self.check_spa_patterns(&document) {
101            return RenderDecision::NeedsBrowser(reason);
102        }
103
104        RenderDecision::Static
105    }
106
107    /// Check if content is empty
108    fn check_empty_content(&self, document: &Html) -> Option<BrowserReason> {
109        // Count text in body
110        let body_selector = Selector::parse("body").ok()?;
111        let body = document.select(&body_selector).next()?;
112        
113        // Exclude scripts and styles
114        let text: String = body
115            .text()
116            .collect::<Vec<_>>()
117            .join(" ")
118            .split_whitespace()
119            .collect::<Vec<_>>()
120            .join(" ");
121
122        if text.len() < self.min_content_length {
123            return Some(BrowserReason::EmptyContent);
124        }
125
126        None
127    }
128
129    /// Detect JS frameworks
130    fn check_js_frameworks(&self, html: &str) -> Option<BrowserReason> {
131        let html_lower = html.to_lowercase();
132
133        // React
134        if html_lower.contains("__react") 
135            || html_lower.contains("data-reactroot")
136            || html_lower.contains("_reactrootcontainer") 
137        {
138            return Some(BrowserReason::JsFramework("React".to_string()));
139        }
140
141        // Vue
142        if html_lower.contains("data-v-") || html_lower.contains("__vue__") {
143            return Some(BrowserReason::JsFramework("Vue".to_string()));
144        }
145
146        // Angular
147        if html_lower.contains("ng-version") || html_lower.contains("ng-app") {
148            return Some(BrowserReason::JsFramework("Angular".to_string()));
149        }
150
151        // Next.js
152        if html_lower.contains("__next") || html_lower.contains("_next/static") {
153            return Some(BrowserReason::JsFramework("Next.js".to_string()));
154        }
155
156        // Nuxt.js
157        if html_lower.contains("__nuxt") || html_lower.contains("_nuxt/") {
158            return Some(BrowserReason::JsFramework("Nuxt.js".to_string()));
159        }
160
161        // Svelte
162        if html_lower.contains("svelte-") {
163            return Some(BrowserReason::JsFramework("Svelte".to_string()));
164        }
165
166        None
167    }
168
169    /// Check the number of scripts
170    fn check_heavy_scripts(&self, document: &Html) -> Option<BrowserReason> {
171        let script_selector = Selector::parse("script[src]").ok()?;
172        let script_count = document.select(&script_selector).count();
173
174        if script_count > self.script_threshold {
175            return Some(BrowserReason::HeavyScripts);
176        }
177
178        None
179    }
180
181    /// Check required selectors
182    fn check_required_selectors(&self, document: &Html) -> Option<BrowserReason> {
183        if self.required_selectors.is_empty() {
184            return None;
185        }
186
187        for selector_str in &self.required_selectors {
188            if let Ok(selector) = Selector::parse(selector_str) {
189                if document.select(&selector).next().is_some() {
190                    return None; // At least one selector found
191                }
192            }
193        }
194
195        // No required selector found
196        Some(BrowserReason::MissingSelectors)
197    }
198
199    /// Detect SPA patterns
200    fn check_spa_patterns(&self, document: &Html) -> Option<BrowserReason> {
201        // Single div as main container
202        let app_selector = Selector::parse("#app, #root, #__next, #__nuxt").ok()?;
203        let app_div = document.select(&app_selector).next()?;
204
205        // Check if the div is almost empty
206        let text: String = app_div.text().collect::<Vec<_>>().join("");
207        let text = text.trim();
208
209        if text.len() < 100 {
210            return Some(BrowserReason::SinglePageApp);
211        }
212
213        None
214    }
215}
216
217/// Indicators for required rendering
218#[derive(Debug, Clone, Default)]
219pub struct RenderIndicators {
220    /// External scripts
221    pub external_scripts: usize,
222    /// Inline scripts
223    pub inline_scripts: usize,
224    /// Lazy loading detected
225    pub has_lazy_loading: bool,
226    /// Framework detected
227    pub detected_framework: Option<String>,
228    /// Text content (characters)
229    pub text_content_length: usize,
230    /// Empty app/root div
231    pub empty_app_container: bool,
232}
233
234/// Analyze render indicators
235pub fn analyze_render_indicators(html: &str) -> RenderIndicators {
236    let document = Html::parse_document(html);
237    let mut indicators = RenderIndicators::default();
238
239    // Compter les scripts
240    if let Ok(sel) = Selector::parse("script[src]") {
241        indicators.external_scripts = document.select(&sel).count();
242    }
243    if let Ok(sel) = Selector::parse("script:not([src])") {
244        indicators.inline_scripts = document.select(&sel).count();
245    }
246
247    // Lazy loading
248    if let Ok(sel) = Selector::parse("[data-src], [data-lazy], [loading='lazy']") {
249        indicators.has_lazy_loading = document.select(&sel).next().is_some();
250    }
251
252    // Framework
253    let checker = RenderChecker::new();
254    if let Some(BrowserReason::JsFramework(fw)) = checker.check_js_frameworks(html) {
255        indicators.detected_framework = Some(fw);
256    }
257
258    // Contenu textuel
259    if let Ok(body_sel) = Selector::parse("body") {
260        if let Some(body) = document.select(&body_sel).next() {
261            indicators.text_content_length = body
262                .text()
263                .collect::<Vec<_>>()
264                .join("")
265                .trim()
266                .len();
267        }
268    }
269
270    // Container vide
271    if let Ok(sel) = Selector::parse("#app, #root") {
272        if let Some(container) = document.select(&sel).next() {
273            let text: String = container.text().collect();
274            indicators.empty_app_container = text.trim().len() < 50;
275        }
276    }
277
278    indicators
279}