// tirith_core/rules/cloaking.rs
//! Server-side cloaking detection — Unix only.
//!
//! Fetches a URL with multiple user-agents and compares responses to detect
//! content differentiation (serving different content to AI bots vs browsers).

#[cfg(unix)]
use crate::verdict::{Evidence, Finding, RuleId, Severity};
7
/// User-agent profiles for cloaking detection: `(short name, User-Agent header)`.
///
/// NOTE: `check` uses index 0 ("chrome") as the comparison baseline, so the
/// chrome entry must remain first in this list.
#[cfg(unix)]
const USER_AGENTS: &[(&str, &str)] = &[
    // Baseline: a mainstream browser UA.
    ("chrome", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"),
    // AI / crawler agents — the profiles a cloaking server may discriminate against.
    ("claudebot", "ClaudeBot/1.0"),
    ("chatgpt", "ChatGPT-User"),
    ("perplexity", "PerplexityBot/1.0"),
    ("googlebot", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"),
    ("curl", "curl/8.7.1"),
];
18
/// Result of a cloaking check.
#[cfg(unix)]
pub struct CloakingResult {
    /// The URL that was checked, exactly as passed by the caller.
    pub url: String,
    /// True when at least one agent's normalized response diverged from the
    /// chrome baseline beyond the diff threshold.
    pub cloaking_detected: bool,
    /// Report findings (one high-severity finding when cloaking is detected;
    /// empty otherwise).
    pub findings: Vec<Finding>,
    /// Per-agent response summaries (agent name, status code, content length).
    pub agent_responses: Vec<AgentResponse>,
    /// Pairs of agents whose responses differed significantly.
    pub diff_pairs: Vec<DiffPair>,
}
30
/// Summary of a single user-agent fetch.
#[cfg(unix)]
pub struct AgentResponse {
    /// Short profile name from `USER_AGENTS` (e.g. "chrome", "claudebot").
    pub agent_name: String,
    /// HTTP status code; 0 means the fetch failed entirely.
    pub status_code: u16,
    /// Raw body length in bytes (0 for failed fetches).
    pub content_length: usize,
}
37
/// A pair of agents whose normalized responses differed beyond the threshold.
#[cfg(unix)]
pub struct DiffPair {
    /// Baseline agent name (always "chrome" as produced by `check`).
    pub agent_a: String,
    /// The agent whose response diverged from the baseline.
    pub agent_b: String,
    /// Rough diff size in characters (see `word_diff_size`).
    pub diff_chars: usize,
    /// Full diff text (populated for Pro enrichment).
    pub diff_text: Option<String>,
}
46
47#[cfg(unix)]
48impl CloakingResult {
49    /// Serialize to JSON. When `include_diff_text` is true (Pro tier), diff text
50    /// is included in the output; otherwise it is omitted.
51    pub fn to_json(&self, include_diff_text: bool) -> serde_json::Value {
52        serde_json::json!({
53            "url": self.url,
54            "cloaking_detected": self.cloaking_detected,
55            "agents": self.agent_responses.iter().map(|a| {
56                serde_json::json!({
57                    "agent": a.agent_name,
58                    "status_code": a.status_code,
59                    "content_length": a.content_length,
60                })
61            }).collect::<Vec<_>>(),
62            "diffs": self.diff_pairs.iter().map(|d| {
63                let mut entry = serde_json::json!({
64                    "agent_a": d.agent_a,
65                    "agent_b": d.agent_b,
66                    "diff_chars": d.diff_chars,
67                });
68                if include_diff_text {
69                    if let Some(ref text) = d.diff_text {
70                        entry.as_object_mut().unwrap().insert(
71                            "diff_text".into(),
72                            serde_json::json!(text),
73                        );
74                    }
75                }
76                entry
77            }).collect::<Vec<_>>(),
78            "findings": self.findings,
79        })
80    }
81}
82
83/// Check a URL for server-side cloaking.
84#[cfg(unix)]
85pub fn check(url: &str) -> Result<CloakingResult, String> {
86    let validated_url = crate::url_validate::validate_fetch_url(url)?;
87    let client = reqwest::blocking::Client::builder()
88        .timeout(std::time::Duration::from_secs(30))
89        .redirect(reqwest::redirect::Policy::custom(|attempt| {
90            if attempt.previous().len() > 10 {
91                attempt.error("too many redirects")
92            } else if let Err(reason) =
93                crate::url_validate::validate_fetch_url(attempt.url().as_str())
94            {
95                attempt.error(reason)
96            } else {
97                attempt.follow()
98            }
99        }))
100        .build()
101        .map_err(|e| format!("HTTP client error: {e}"))?;
102
103    const MAX_BODY: usize = 10 * 1024 * 1024; // 10 MiB
104
105    // Fetch with each user-agent
106    let mut responses: Vec<(String, u16, String)> = Vec::new();
107
108    for (name, ua) in USER_AGENTS {
109        match fetch_with_ua(&client, validated_url.as_str(), ua, MAX_BODY) {
110            Ok((status, body)) => {
111                responses.push((name.to_string(), status, body));
112            }
113            Err(e) => {
114                eprintln!("tirith: cloaking: {name} fetch failed: {e}");
115                responses.push((name.to_string(), 0, String::new()));
116            }
117        }
118    }
119
120    // Check if all fetches failed
121    let successful_count = responses.iter().filter(|(_, s, _)| *s != 0).count();
122    if successful_count == 0 {
123        return Err("all user-agent fetches failed — cannot perform cloaking analysis".to_string());
124    }
125
126    // Use chrome as baseline
127    let baseline_idx = 0; // chrome is first
128    let baseline_body = &responses[baseline_idx].2;
129
130    // If baseline fetch failed (empty body), we cannot reliably compare — return no findings
131    // rather than false-flagging every non-empty response as cloaking.
132    if baseline_body.is_empty() {
133        let agent_responses: Vec<AgentResponse> = responses
134            .iter()
135            .map(|(name, status, body)| AgentResponse {
136                agent_name: name.clone(),
137                status_code: *status,
138                content_length: body.len(),
139            })
140            .collect();
141        return Ok(CloakingResult {
142            url: url.to_string(),
143            cloaking_detected: false,
144            findings: Vec::new(),
145            agent_responses,
146            diff_pairs: Vec::new(),
147        });
148    }
149
150    let baseline_normalized = normalize_html(baseline_body);
151
152    let mut diff_pairs = Vec::new();
153    let mut cloaking_detected = false;
154
155    let agent_responses: Vec<AgentResponse> = responses
156        .iter()
157        .map(|(name, status, body)| AgentResponse {
158            agent_name: name.clone(),
159            status_code: *status,
160            content_length: body.len(),
161        })
162        .collect();
163
164    // Compare each non-baseline response against chrome baseline
165    for (i, (name, _status, body)) in responses.iter().enumerate() {
166        if i == baseline_idx {
167            continue;
168        }
169        if body.is_empty() {
170            continue; // Skip failed fetches
171        }
172
173        let normalized = normalize_html(body);
174        let diff_chars = word_diff_size(&baseline_normalized, &normalized);
175
176        if diff_chars > 10 {
177            cloaking_detected = true;
178            // Generate diff text showing what words differ
179            let diff_detail = generate_diff_text(&baseline_normalized, &normalized);
180            diff_pairs.push(DiffPair {
181                agent_a: "chrome".to_string(),
182                agent_b: name.clone(),
183                diff_chars,
184                diff_text: Some(diff_detail),
185            });
186        }
187    }
188
189    let mut findings = Vec::new();
190    if cloaking_detected {
191        let differing: Vec<&str> = diff_pairs.iter().map(|d| d.agent_b.as_str()).collect();
192        findings.push(Finding {
193            rule_id: RuleId::ServerCloaking,
194            severity: Severity::High,
195            title: "Server-side cloaking detected".to_string(),
196            description: format!(
197                "URL serves different content to different user-agents. \
198                 Differing agents: {}",
199                differing.join(", ")
200            ),
201            evidence: diff_pairs
202                .iter()
203                .map(|d| Evidence::Text {
204                    detail: format!(
205                        "{} vs {}: {} chars different",
206                        d.agent_a, d.agent_b, d.diff_chars
207                    ),
208                })
209                .collect(),
210            human_view: None,
211            agent_view: None,
212            mitre_id: None,
213            custom_rule_id: None,
214        });
215    }
216
217    Ok(CloakingResult {
218        url: url.to_string(),
219        cloaking_detected,
220        findings,
221        agent_responses,
222        diff_pairs,
223    })
224}
225
226#[cfg(unix)]
227fn fetch_with_ua(
228    client: &reqwest::blocking::Client,
229    url: &str,
230    ua: &str,
231    max_body: usize,
232) -> Result<(u16, String), String> {
233    let response = client
234        .get(url)
235        .header("User-Agent", ua)
236        .send()
237        .map_err(|e| format!("request failed: {e}"))?;
238
239    let status = response.status().as_u16();
240
241    // Check content length hint
242    if let Some(len) = response.content_length() {
243        if len > max_body as u64 {
244            return Err(format!("response too large: {len} bytes"));
245        }
246    }
247
248    // Read body with size limit to prevent OOM from servers without Content-Length
249    use std::io::Read as _;
250    let mut body_bytes = Vec::with_capacity(max_body.min(1024 * 1024));
251    response
252        .take((max_body as u64) + 1)
253        .read_to_end(&mut body_bytes)
254        .map_err(|e| format!("read body: {e}"))?;
255    if body_bytes.len() > max_body {
256        return Err(format!("response too large: {} bytes", body_bytes.len()));
257    }
258
259    let body = String::from_utf8_lossy(&body_bytes).into_owned();
260    Ok((status, body))
261}
262
263/// Normalize HTML for comparison — strip volatile content that changes
264/// between requests (scripts, styles, CSRF tokens, nonces, timestamps).
265#[cfg(unix)]
266fn normalize_html(input: &str) -> String {
267    use once_cell::sync::Lazy;
268    use regex::Regex;
269
270    static SCRIPT: Lazy<Regex> =
271        Lazy::new(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap());
272    static STYLE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap());
273    static NONCE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"(?i)\bnonce="[^"]*""#).unwrap());
274    static CSRF: Lazy<Regex> =
275        Lazy::new(|| Regex::new(r#"(?i)<[^>]*csrf[_-]?token[^>]*>"#).unwrap());
276    static WHITESPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
277
278    let s = SCRIPT.replace_all(input, "");
279    let s = STYLE.replace_all(&s, "");
280    let s = NONCE.replace_all(&s, "");
281    let s = CSRF.replace_all(&s, "");
282    let s = WHITESPACE.replace_all(&s, " ");
283    s.trim().to_string()
284}
285
/// Build a word-frequency map for diff computation.
///
/// Words are whitespace-separated tokens; the returned map borrows from `s`.
#[cfg(unix)]
fn word_counts(s: &str) -> std::collections::HashMap<&str, usize> {
    s.split_whitespace()
        .fold(std::collections::HashMap::new(), |mut acc, word| {
            *acc.entry(word).or_default() += 1;
            acc
        })
}
295
296/// Generate a human-readable summary of word-level differences between two texts.
297/// Shows words present in one response but not the other (capped at 500 chars).
298#[cfg(unix)]
299fn generate_diff_text(baseline: &str, other: &str) -> String {
300    let counts_a = word_counts(baseline);
301    let counts_b = word_counts(other);
302
303    let mut only_in_baseline = Vec::new();
304    let mut only_in_other = Vec::new();
305
306    for (word, &count_a) in &counts_a {
307        let count_b = counts_b.get(word).copied().unwrap_or(0);
308        if count_a > count_b {
309            only_in_baseline.push(*word);
310        }
311    }
312
313    for (word, &count_b) in &counts_b {
314        let count_a = counts_a.get(word).copied().unwrap_or(0);
315        if count_b > count_a {
316            only_in_other.push(*word);
317        }
318    }
319
320    let mut result = String::new();
321    if !only_in_baseline.is_empty() {
322        result.push_str("Only in baseline (chrome): ");
323        let preview: String = only_in_baseline
324            .iter()
325            .take(20)
326            .copied()
327            .collect::<Vec<_>>()
328            .join(" ");
329        result.push_str(&preview);
330        if only_in_baseline.len() > 20 {
331            result.push_str(&format!(" ... (+{} more)", only_in_baseline.len() - 20));
332        }
333    }
334    if !only_in_other.is_empty() {
335        if !result.is_empty() {
336            result.push_str(" | ");
337        }
338        result.push_str("Only in this agent: ");
339        let preview: String = only_in_other
340            .iter()
341            .take(20)
342            .copied()
343            .collect::<Vec<_>>()
344            .join(" ");
345        result.push_str(&preview);
346        if only_in_other.len() > 20 {
347            result.push_str(&format!(" ... (+{} more)", only_in_other.len() - 20));
348        }
349    }
350
351    // Char-safe truncation (avoid panic on multibyte boundary)
352    if result.len() > 500 {
353        let truncated: String = result.chars().take(497).collect();
354        result = format!("{truncated}...");
355    }
356    result
357}
358
/// Simple word-level diff size in characters.
///
/// Counts total characters in words that are in one string but not the other,
/// weighted by the surplus occurrence count. This is a rough measure — not a
/// proper edit distance, but sufficient for detecting meaningful content
/// differences vs. cosmetic variations.
#[cfg(unix)]
fn word_diff_size(a: &str, b: &str) -> usize {
    use std::collections::HashMap;

    // Histograms of whitespace-separated words, built inline.
    let mut counts_a: HashMap<&str, usize> = HashMap::new();
    for word in a.split_whitespace() {
        *counts_a.entry(word).or_default() += 1;
    }
    let mut counts_b: HashMap<&str, usize> = HashMap::new();
    for word in b.split_whitespace() {
        *counts_b.entry(word).or_default() += 1;
    }

    let mut total = 0usize;

    // Surplus occurrences on the A side (zero when B has at least as many).
    for (word, &count_a) in &counts_a {
        let count_b = counts_b.get(word).copied().unwrap_or(0);
        total += word.len() * count_a.saturating_sub(count_b);
    }

    // Surplus occurrences on the B side.
    for (word, &count_b) in &counts_b {
        let count_a = counts_a.get(word).copied().unwrap_or(0);
        total += word.len() * count_b.saturating_sub(count_a);
    }

    total
}
389
// Unit tests for the pure helpers (normalization and diff measurement) plus
// the pre-fetch URL-validation guard. Network-dependent paths are not tested.
#[cfg(test)]
#[cfg(unix)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_html_strips_scripts() {
        let input = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let normalized = normalize_html(input);
        assert!(!normalized.contains("var x"));
        assert!(normalized.contains("Hello"));
    }

    #[test]
    fn test_normalize_html_strips_styles() {
        let input = "<html><style>.hidden { display:none }</style><body>Hello</body></html>";
        let normalized = normalize_html(input);
        assert!(!normalized.contains("display:none"));
        assert!(normalized.contains("Hello"));
    }

    #[test]
    fn test_normalize_html_strips_nonces() {
        // Use a non-script element so the NONCE regex is actually exercised
        // (the SCRIPT regex would strip the entire <script> tag before NONCE runs).
        let input = r#"<div nonce="abc123">Content</div><p>More</p>"#;
        let normalized = normalize_html(input);
        assert!(
            !normalized.contains("nonce"),
            "nonce attribute should be stripped: {normalized}"
        );
        assert!(normalized.contains("Content"));
    }

    #[test]
    fn test_word_diff_size_identical() {
        assert_eq!(word_diff_size("hello world", "hello world"), 0);
    }

    #[test]
    fn test_word_diff_size_different() {
        let diff = word_diff_size("hello world", "hello planet");
        assert!(diff > 0, "different words should produce non-zero diff");
    }

    #[test]
    fn test_word_diff_size_threshold() {
        // Small cosmetic difference (single word) — must stay at or below the
        // 10-char threshold that `check` uses to flag cloaking.
        let diff = word_diff_size("Welcome to our site today", "Welcome to our site");
        assert!(diff <= 10, "minor diff should be <=10 chars, got {diff}");
    }

    #[test]
    fn test_word_diff_size_large_difference() {
        // A bot-blocking page vs. real content — must exceed the threshold.
        let a = "Welcome to our website. We offer great products and services.";
        let b = "Access denied. This content is not available for automated crawlers.";
        let diff = word_diff_size(a, b);
        assert!(
            diff > 10,
            "significant content difference should exceed threshold, got {diff}"
        );
    }

    #[test]
    fn test_cloaking_rejects_localhost_target_before_fetch() {
        // SSRF guard: url_validate must reject localhost before any HTTP work.
        match check("http://localhost/") {
            Ok(_) => panic!("expected localhost target to be rejected"),
            Err(err) => assert!(err.contains("localhost")),
        }
    }
}