// tirith_core/rules/cloaking.rs
//! Server-side cloaking detection — Unix only.
//!
//! Fetches a URL with multiple user-agents and compares responses to detect
//! content differentiation (serving different content to AI bots vs browsers).
5#[cfg(unix)]
6use crate::verdict::{Evidence, Finding, RuleId, Severity};
7
8/// User-agent profiles for cloaking detection.
9#[cfg(unix)]
10const USER_AGENTS: &[(&str, &str)] = &[
11    ("chrome", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"),
12    ("claudebot", "ClaudeBot/1.0"),
13    ("chatgpt", "ChatGPT-User"),
14    ("perplexity", "PerplexityBot/1.0"),
15    ("googlebot", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"),
16    ("curl", "curl/8.7.1"),
17];
18
19/// Result of a cloaking check.
20#[cfg(unix)]
21pub struct CloakingResult {
22    pub url: String,
23    pub cloaking_detected: bool,
24    pub findings: Vec<Finding>,
25    /// Per-agent response summaries (agent name, status code, content length).
26    pub agent_responses: Vec<AgentResponse>,
27    /// Pairs of agents whose responses differed significantly.
28    pub diff_pairs: Vec<DiffPair>,
29}
30
31#[cfg(unix)]
32pub struct AgentResponse {
33    pub agent_name: String,
34    pub status_code: u16,
35    pub content_length: usize,
36}
37
38#[cfg(unix)]
39pub struct DiffPair {
40    pub agent_a: String,
41    pub agent_b: String,
42    pub diff_chars: usize,
43    /// Full diff text (populated for Pro enrichment).
44    pub diff_text: Option<String>,
45}
46
47#[cfg(unix)]
48impl CloakingResult {
49    /// Serialize to JSON. When `include_diff_text` is true (Pro tier), diff text
50    /// is included in the output; otherwise it is omitted.
51    pub fn to_json(&self, include_diff_text: bool) -> serde_json::Value {
52        serde_json::json!({
53            "url": self.url,
54            "cloaking_detected": self.cloaking_detected,
55            "agents": self.agent_responses.iter().map(|a| {
56                serde_json::json!({
57                    "agent": a.agent_name,
58                    "status_code": a.status_code,
59                    "content_length": a.content_length,
60                })
61            }).collect::<Vec<_>>(),
62            "diffs": self.diff_pairs.iter().map(|d| {
63                let mut entry = serde_json::json!({
64                    "agent_a": d.agent_a,
65                    "agent_b": d.agent_b,
66                    "diff_chars": d.diff_chars,
67                });
68                if include_diff_text {
69                    if let Some(ref text) = d.diff_text {
70                        entry.as_object_mut().unwrap().insert(
71                            "diff_text".into(),
72                            serde_json::json!(text),
73                        );
74                    }
75                }
76                entry
77            }).collect::<Vec<_>>(),
78            "findings": self.findings,
79        })
80    }
81}
82
83/// Check a URL for server-side cloaking.
84#[cfg(unix)]
85pub fn check(url: &str) -> Result<CloakingResult, String> {
86    let validated_url = crate::url_validate::validate_fetch_url(url)?;
87    let client = reqwest::blocking::Client::builder()
88        .timeout(std::time::Duration::from_secs(30))
89        .redirect(reqwest::redirect::Policy::custom(|attempt| {
90            if attempt.previous().len() > 10 {
91                attempt.error("too many redirects")
92            } else if let Err(reason) =
93                crate::url_validate::validate_fetch_url(attempt.url().as_str())
94            {
95                attempt.error(reason)
96            } else {
97                attempt.follow()
98            }
99        }))
100        .build()
101        .map_err(|e| format!("HTTP client error: {e}"))?;
102
103    const MAX_BODY: usize = 10 * 1024 * 1024; // 10 MiB
104
105    let mut responses: Vec<(String, u16, String)> = Vec::new();
106
107    for (name, ua) in USER_AGENTS {
108        match fetch_with_ua(&client, validated_url.as_str(), ua, MAX_BODY) {
109            Ok((status, body)) => {
110                responses.push((name.to_string(), status, body));
111            }
112            Err(e) => {
113                eprintln!("tirith: cloaking: {name} fetch failed: {e}");
114                responses.push((name.to_string(), 0, String::new()));
115            }
116        }
117    }
118
119    let successful_count = responses.iter().filter(|(_, s, _)| *s != 0).count();
120    if successful_count == 0 {
121        return Err("all user-agent fetches failed — cannot perform cloaking analysis".to_string());
122    }
123
124    // chrome is the baseline (USER_AGENTS[0]); other agents are compared against it.
125    let baseline_idx = 0;
126    let baseline_body = &responses[baseline_idx].2;
127
128    // If the baseline fetch failed we'd otherwise flag every successful agent as cloaked.
129    if baseline_body.is_empty() {
130        let agent_responses: Vec<AgentResponse> = responses
131            .iter()
132            .map(|(name, status, body)| AgentResponse {
133                agent_name: name.clone(),
134                status_code: *status,
135                content_length: body.len(),
136            })
137            .collect();
138        return Ok(CloakingResult {
139            url: url.to_string(),
140            cloaking_detected: false,
141            findings: Vec::new(),
142            agent_responses,
143            diff_pairs: Vec::new(),
144        });
145    }
146
147    let baseline_normalized = normalize_html(baseline_body);
148
149    let mut diff_pairs = Vec::new();
150    let mut cloaking_detected = false;
151
152    let agent_responses: Vec<AgentResponse> = responses
153        .iter()
154        .map(|(name, status, body)| AgentResponse {
155            agent_name: name.clone(),
156            status_code: *status,
157            content_length: body.len(),
158        })
159        .collect();
160
161    for (i, (name, _status, body)) in responses.iter().enumerate() {
162        if i == baseline_idx {
163            continue;
164        }
165        if body.is_empty() {
166            continue;
167        }
168
169        let normalized = normalize_html(body);
170        let diff_chars = word_diff_size(&baseline_normalized, &normalized);
171
172        if diff_chars > 10 {
173            cloaking_detected = true;
174            let diff_detail = generate_diff_text(&baseline_normalized, &normalized);
175            diff_pairs.push(DiffPair {
176                agent_a: "chrome".to_string(),
177                agent_b: name.clone(),
178                diff_chars,
179                diff_text: Some(diff_detail),
180            });
181        }
182    }
183
184    let mut findings = Vec::new();
185    if cloaking_detected {
186        let differing: Vec<&str> = diff_pairs.iter().map(|d| d.agent_b.as_str()).collect();
187        findings.push(Finding {
188            rule_id: RuleId::ServerCloaking,
189            severity: Severity::High,
190            title: "Server-side cloaking detected".to_string(),
191            description: format!(
192                "URL serves different content to different user-agents. \
193                 Differing agents: {}",
194                differing.join(", ")
195            ),
196            evidence: diff_pairs
197                .iter()
198                .map(|d| Evidence::Text {
199                    detail: format!(
200                        "{} vs {}: {} chars different",
201                        d.agent_a, d.agent_b, d.diff_chars
202                    ),
203                })
204                .collect(),
205            human_view: None,
206            agent_view: None,
207            mitre_id: None,
208            custom_rule_id: None,
209        });
210    }
211
212    Ok(CloakingResult {
213        url: url.to_string(),
214        cloaking_detected,
215        findings,
216        agent_responses,
217        diff_pairs,
218    })
219}
220
221#[cfg(unix)]
222fn fetch_with_ua(
223    client: &reqwest::blocking::Client,
224    url: &str,
225    ua: &str,
226    max_body: usize,
227) -> Result<(u16, String), String> {
228    let response = client
229        .get(url)
230        .header("User-Agent", ua)
231        .send()
232        .map_err(|e| format!("request failed: {e}"))?;
233
234    let status = response.status().as_u16();
235
236    if let Some(len) = response.content_length() {
237        if len > max_body as u64 {
238            return Err(format!("response too large: {len} bytes"));
239        }
240    }
241
242    // Belt-and-braces cap on the actual stream — Content-Length may be missing or lying.
243    use std::io::Read as _;
244    let mut body_bytes = Vec::with_capacity(max_body.min(1024 * 1024));
245    response
246        .take((max_body as u64) + 1)
247        .read_to_end(&mut body_bytes)
248        .map_err(|e| format!("read body: {e}"))?;
249    if body_bytes.len() > max_body {
250        return Err(format!("response too large: {} bytes", body_bytes.len()));
251    }
252
253    let body = String::from_utf8_lossy(&body_bytes).into_owned();
254    Ok((status, body))
255}
256
257/// Normalize HTML for comparison — strip volatile content that changes
258/// between requests (scripts, styles, CSRF tokens, nonces, timestamps).
259#[cfg(unix)]
260fn normalize_html(input: &str) -> String {
261    use once_cell::sync::Lazy;
262    use regex::Regex;
263
264    static SCRIPT: Lazy<Regex> =
265        Lazy::new(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap());
266    static STYLE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap());
267    static NONCE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"(?i)\bnonce="[^"]*""#).unwrap());
268    static CSRF: Lazy<Regex> =
269        Lazy::new(|| Regex::new(r#"(?i)<[^>]*csrf[_-]?token[^>]*>"#).unwrap());
270    static WHITESPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
271
272    let s = SCRIPT.replace_all(input, "");
273    let s = STYLE.replace_all(&s, "");
274    let s = NONCE.replace_all(&s, "");
275    let s = CSRF.replace_all(&s, "");
276    let s = WHITESPACE.replace_all(&s, " ");
277    s.trim().to_string()
278}
279
280/// Build a word-frequency map for diff computation.
281#[cfg(unix)]
282fn word_counts(s: &str) -> std::collections::HashMap<&str, usize> {
283    let mut counts = std::collections::HashMap::new();
284    for word in s.split_whitespace() {
285        *counts.entry(word).or_insert(0) += 1;
286    }
287    counts
288}
289
290/// Generate a human-readable summary of word-level differences between two texts.
291/// Shows words present in one response but not the other (capped at 500 chars).
292#[cfg(unix)]
293fn generate_diff_text(baseline: &str, other: &str) -> String {
294    let counts_a = word_counts(baseline);
295    let counts_b = word_counts(other);
296
297    let mut only_in_baseline = Vec::new();
298    let mut only_in_other = Vec::new();
299
300    for (word, &count_a) in &counts_a {
301        let count_b = counts_b.get(word).copied().unwrap_or(0);
302        if count_a > count_b {
303            only_in_baseline.push(*word);
304        }
305    }
306
307    for (word, &count_b) in &counts_b {
308        let count_a = counts_a.get(word).copied().unwrap_or(0);
309        if count_b > count_a {
310            only_in_other.push(*word);
311        }
312    }
313
314    let mut result = String::new();
315    if !only_in_baseline.is_empty() {
316        result.push_str("Only in baseline (chrome): ");
317        let preview: String = only_in_baseline
318            .iter()
319            .take(20)
320            .copied()
321            .collect::<Vec<_>>()
322            .join(" ");
323        result.push_str(&preview);
324        if only_in_baseline.len() > 20 {
325            result.push_str(&format!(" ... (+{} more)", only_in_baseline.len() - 20));
326        }
327    }
328    if !only_in_other.is_empty() {
329        if !result.is_empty() {
330            result.push_str(" | ");
331        }
332        result.push_str("Only in this agent: ");
333        let preview: String = only_in_other
334            .iter()
335            .take(20)
336            .copied()
337            .collect::<Vec<_>>()
338            .join(" ");
339        result.push_str(&preview);
340        if only_in_other.len() > 20 {
341            result.push_str(&format!(" ... (+{} more)", only_in_other.len() - 20));
342        }
343    }
344
345    // Char-safe truncation — slicing on a byte boundary inside a UTF-8 codepoint panics.
346    if result.len() > 500 {
347        let truncated: String = result.chars().take(497).collect();
348        result = format!("{truncated}...");
349    }
350    result
351}
352
353/// Simple word-level diff size in characters.
354///
355/// Counts total characters in words that are in one string but not the other.
356/// This is a rough measure — not a proper edit distance, but sufficient for
357/// detecting meaningful content differences vs. cosmetic variations.
358#[cfg(unix)]
359fn word_diff_size(a: &str, b: &str) -> usize {
360    let counts_a = word_counts(a);
361    let counts_b = word_counts(b);
362
363    let mut diff = 0usize;
364
365    for (word, &count_a) in &counts_a {
366        let count_b = counts_b.get(word).copied().unwrap_or(0);
367        if count_a > count_b {
368            diff += word.len() * (count_a - count_b);
369        }
370    }
371
372    for (word, &count_b) in &counts_b {
373        let count_a = counts_a.get(word).copied().unwrap_or(0);
374        if count_b > count_a {
375            diff += word.len() * (count_b - count_a);
376        }
377    }
378
379    diff
380}
381
382#[cfg(test)]
383#[cfg(unix)]
384mod tests {
385    use super::*;
386
387    #[test]
388    fn test_normalize_html_strips_scripts() {
389        let input = "<html><script>var x = 1;</script><body>Hello</body></html>";
390        let normalized = normalize_html(input);
391        assert!(!normalized.contains("var x"));
392        assert!(normalized.contains("Hello"));
393    }
394
395    #[test]
396    fn test_normalize_html_strips_styles() {
397        let input = "<html><style>.hidden { display:none }</style><body>Hello</body></html>";
398        let normalized = normalize_html(input);
399        assert!(!normalized.contains("display:none"));
400        assert!(normalized.contains("Hello"));
401    }
402
403    #[test]
404    fn test_normalize_html_strips_nonces() {
405        // Test on a non-script element — the SCRIPT regex would otherwise strip the
406        // whole `<script>` tag before NONCE runs and the assertion would pass vacuously.
407        let input = r#"<div nonce="abc123">Content</div><p>More</p>"#;
408        let normalized = normalize_html(input);
409        assert!(
410            !normalized.contains("nonce"),
411            "nonce attribute should be stripped: {normalized}"
412        );
413        assert!(normalized.contains("Content"));
414    }
415
416    #[test]
417    fn test_word_diff_size_identical() {
418        assert_eq!(word_diff_size("hello world", "hello world"), 0);
419    }
420
421    #[test]
422    fn test_word_diff_size_different() {
423        let diff = word_diff_size("hello world", "hello planet");
424        assert!(diff > 0, "different words should produce non-zero diff");
425    }
426
427    #[test]
428    fn test_word_diff_size_threshold() {
429        let diff = word_diff_size("Welcome to our site today", "Welcome to our site");
430        assert!(diff <= 10, "minor diff should be <=10 chars, got {diff}");
431    }
432
433    #[test]
434    fn test_word_diff_size_large_difference() {
435        let a = "Welcome to our website. We offer great products and services.";
436        let b = "Access denied. This content is not available for automated crawlers.";
437        let diff = word_diff_size(a, b);
438        assert!(
439            diff > 10,
440            "significant content difference should exceed threshold, got {diff}"
441        );
442    }
443
444    #[test]
445    fn test_cloaking_rejects_localhost_target_before_fetch() {
446        match check("http://localhost/") {
447            Ok(_) => panic!("expected localhost target to be rejected"),
448            Err(err) => assert!(err.contains("localhost")),
449        }
450    }
451}