1#[cfg(unix)]
6use crate::verdict::{Evidence, Finding, RuleId, Severity};
7
#[cfg(unix)]
/// (label, User-Agent header value) pairs probed against the target URL.
/// The first entry ("chrome") is the human-browser baseline that the
/// bot-identifying agents below are diffed against in `check`.
const USER_AGENTS: &[(&str, &str)] = &[
    ("chrome", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"),
    ("claudebot", "ClaudeBot/1.0"),
    ("chatgpt", "ChatGPT-User"),
    ("perplexity", "PerplexityBot/1.0"),
    ("googlebot", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"),
    ("curl", "curl/8.7.1"),
];
18
#[cfg(unix)]
/// Aggregated outcome of a multi-user-agent cloaking probe for one URL.
pub struct CloakingResult {
    /// The URL exactly as passed to `check` (pre-validation).
    pub url: String,
    /// True when at least one agent's normalized body differed from the
    /// baseline by more than the detection threshold.
    pub cloaking_detected: bool,
    /// Findings emitted when cloaking was detected (empty otherwise).
    pub findings: Vec<Finding>,
    /// Per-agent fetch outcome; a status code of 0 marks a failed fetch.
    pub agent_responses: Vec<AgentResponse>,
    /// Baseline-vs-agent pairs whose content differed past the threshold.
    pub diff_pairs: Vec<DiffPair>,
}
30
#[cfg(unix)]
/// Outcome of fetching the URL with a single user-agent string.
pub struct AgentResponse {
    /// Label from `USER_AGENTS` (e.g. "chrome", "googlebot").
    pub agent_name: String,
    /// HTTP status code; 0 indicates the fetch itself failed.
    pub status_code: u16,
    /// Raw (un-normalized) body length in bytes.
    pub content_length: usize,
}
37
#[cfg(unix)]
/// A baseline/agent pair whose normalized bodies differed.
pub struct DiffPair {
    /// Label of the baseline agent.
    pub agent_a: String,
    /// Label of the agent whose content differed from the baseline.
    pub agent_b: String,
    /// Approximate size of the word-level difference, in characters.
    pub diff_chars: usize,
    /// Human-readable diff summary; `None` when not generated.
    pub diff_text: Option<String>,
}
46
47#[cfg(unix)]
48impl CloakingResult {
49 pub fn to_json(&self, include_diff_text: bool) -> serde_json::Value {
52 serde_json::json!({
53 "url": self.url,
54 "cloaking_detected": self.cloaking_detected,
55 "agents": self.agent_responses.iter().map(|a| {
56 serde_json::json!({
57 "agent": a.agent_name,
58 "status_code": a.status_code,
59 "content_length": a.content_length,
60 })
61 }).collect::<Vec<_>>(),
62 "diffs": self.diff_pairs.iter().map(|d| {
63 let mut entry = serde_json::json!({
64 "agent_a": d.agent_a,
65 "agent_b": d.agent_b,
66 "diff_chars": d.diff_chars,
67 });
68 if include_diff_text {
69 if let Some(ref text) = d.diff_text {
70 entry.as_object_mut().unwrap().insert(
71 "diff_text".into(),
72 serde_json::json!(text),
73 );
74 }
75 }
76 entry
77 }).collect::<Vec<_>>(),
78 "findings": self.findings,
79 })
80 }
81}
82
83#[cfg(unix)]
85pub fn check(url: &str) -> Result<CloakingResult, String> {
86 let validated_url = crate::url_validate::validate_fetch_url(url)?;
87 let client = reqwest::blocking::Client::builder()
88 .timeout(std::time::Duration::from_secs(30))
89 .redirect(reqwest::redirect::Policy::custom(|attempt| {
90 if attempt.previous().len() > 10 {
91 attempt.error("too many redirects")
92 } else if let Err(reason) =
93 crate::url_validate::validate_fetch_url(attempt.url().as_str())
94 {
95 attempt.error(reason)
96 } else {
97 attempt.follow()
98 }
99 }))
100 .build()
101 .map_err(|e| format!("HTTP client error: {e}"))?;
102
103 const MAX_BODY: usize = 10 * 1024 * 1024; let mut responses: Vec<(String, u16, String)> = Vec::new();
106
107 for (name, ua) in USER_AGENTS {
108 match fetch_with_ua(&client, validated_url.as_str(), ua, MAX_BODY) {
109 Ok((status, body)) => {
110 responses.push((name.to_string(), status, body));
111 }
112 Err(e) => {
113 eprintln!("tirith: cloaking: {name} fetch failed: {e}");
114 responses.push((name.to_string(), 0, String::new()));
115 }
116 }
117 }
118
119 let successful_count = responses.iter().filter(|(_, s, _)| *s != 0).count();
120 if successful_count == 0 {
121 return Err("all user-agent fetches failed — cannot perform cloaking analysis".to_string());
122 }
123
124 let baseline_idx = 0;
126 let baseline_body = &responses[baseline_idx].2;
127
128 if baseline_body.is_empty() {
130 let agent_responses: Vec<AgentResponse> = responses
131 .iter()
132 .map(|(name, status, body)| AgentResponse {
133 agent_name: name.clone(),
134 status_code: *status,
135 content_length: body.len(),
136 })
137 .collect();
138 return Ok(CloakingResult {
139 url: url.to_string(),
140 cloaking_detected: false,
141 findings: Vec::new(),
142 agent_responses,
143 diff_pairs: Vec::new(),
144 });
145 }
146
147 let baseline_normalized = normalize_html(baseline_body);
148
149 let mut diff_pairs = Vec::new();
150 let mut cloaking_detected = false;
151
152 let agent_responses: Vec<AgentResponse> = responses
153 .iter()
154 .map(|(name, status, body)| AgentResponse {
155 agent_name: name.clone(),
156 status_code: *status,
157 content_length: body.len(),
158 })
159 .collect();
160
161 for (i, (name, _status, body)) in responses.iter().enumerate() {
162 if i == baseline_idx {
163 continue;
164 }
165 if body.is_empty() {
166 continue;
167 }
168
169 let normalized = normalize_html(body);
170 let diff_chars = word_diff_size(&baseline_normalized, &normalized);
171
172 if diff_chars > 10 {
173 cloaking_detected = true;
174 let diff_detail = generate_diff_text(&baseline_normalized, &normalized);
175 diff_pairs.push(DiffPair {
176 agent_a: "chrome".to_string(),
177 agent_b: name.clone(),
178 diff_chars,
179 diff_text: Some(diff_detail),
180 });
181 }
182 }
183
184 let mut findings = Vec::new();
185 if cloaking_detected {
186 let differing: Vec<&str> = diff_pairs.iter().map(|d| d.agent_b.as_str()).collect();
187 findings.push(Finding {
188 rule_id: RuleId::ServerCloaking,
189 severity: Severity::High,
190 title: "Server-side cloaking detected".to_string(),
191 description: format!(
192 "URL serves different content to different user-agents. \
193 Differing agents: {}",
194 differing.join(", ")
195 ),
196 evidence: diff_pairs
197 .iter()
198 .map(|d| Evidence::Text {
199 detail: format!(
200 "{} vs {}: {} chars different",
201 d.agent_a, d.agent_b, d.diff_chars
202 ),
203 })
204 .collect(),
205 human_view: None,
206 agent_view: None,
207 mitre_id: None,
208 custom_rule_id: None,
209 });
210 }
211
212 Ok(CloakingResult {
213 url: url.to_string(),
214 cloaking_detected,
215 findings,
216 agent_responses,
217 diff_pairs,
218 })
219}
220
221#[cfg(unix)]
222fn fetch_with_ua(
223 client: &reqwest::blocking::Client,
224 url: &str,
225 ua: &str,
226 max_body: usize,
227) -> Result<(u16, String), String> {
228 let response = client
229 .get(url)
230 .header("User-Agent", ua)
231 .send()
232 .map_err(|e| format!("request failed: {e}"))?;
233
234 let status = response.status().as_u16();
235
236 if let Some(len) = response.content_length() {
237 if len > max_body as u64 {
238 return Err(format!("response too large: {len} bytes"));
239 }
240 }
241
242 use std::io::Read as _;
244 let mut body_bytes = Vec::with_capacity(max_body.min(1024 * 1024));
245 response
246 .take((max_body as u64) + 1)
247 .read_to_end(&mut body_bytes)
248 .map_err(|e| format!("read body: {e}"))?;
249 if body_bytes.len() > max_body {
250 return Err(format!("response too large: {} bytes", body_bytes.len()));
251 }
252
253 let body = String::from_utf8_lossy(&body_bytes).into_owned();
254 Ok((status, body))
255}
256
257#[cfg(unix)]
260fn normalize_html(input: &str) -> String {
261 use once_cell::sync::Lazy;
262 use regex::Regex;
263
264 static SCRIPT: Lazy<Regex> =
265 Lazy::new(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap());
266 static STYLE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap());
267 static NONCE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"(?i)\bnonce="[^"]*""#).unwrap());
268 static CSRF: Lazy<Regex> =
269 Lazy::new(|| Regex::new(r#"(?i)<[^>]*csrf[_-]?token[^>]*>"#).unwrap());
270 static WHITESPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
271
272 let s = SCRIPT.replace_all(input, "");
273 let s = STYLE.replace_all(&s, "");
274 let s = NONCE.replace_all(&s, "");
275 let s = CSRF.replace_all(&s, "");
276 let s = WHITESPACE.replace_all(&s, " ");
277 s.trim().to_string()
278}
279
#[cfg(unix)]
/// Counts occurrences of each whitespace-separated word in `s`.
/// Keys borrow from `s`, so no per-word allocation takes place.
fn word_counts(s: &str) -> std::collections::HashMap<&str, usize> {
    s.split_whitespace()
        .fold(std::collections::HashMap::new(), |mut acc, word| {
            *acc.entry(word).or_default() += 1;
            acc
        })
}
289
290#[cfg(unix)]
293fn generate_diff_text(baseline: &str, other: &str) -> String {
294 let counts_a = word_counts(baseline);
295 let counts_b = word_counts(other);
296
297 let mut only_in_baseline = Vec::new();
298 let mut only_in_other = Vec::new();
299
300 for (word, &count_a) in &counts_a {
301 let count_b = counts_b.get(word).copied().unwrap_or(0);
302 if count_a > count_b {
303 only_in_baseline.push(*word);
304 }
305 }
306
307 for (word, &count_b) in &counts_b {
308 let count_a = counts_a.get(word).copied().unwrap_or(0);
309 if count_b > count_a {
310 only_in_other.push(*word);
311 }
312 }
313
314 let mut result = String::new();
315 if !only_in_baseline.is_empty() {
316 result.push_str("Only in baseline (chrome): ");
317 let preview: String = only_in_baseline
318 .iter()
319 .take(20)
320 .copied()
321 .collect::<Vec<_>>()
322 .join(" ");
323 result.push_str(&preview);
324 if only_in_baseline.len() > 20 {
325 result.push_str(&format!(" ... (+{} more)", only_in_baseline.len() - 20));
326 }
327 }
328 if !only_in_other.is_empty() {
329 if !result.is_empty() {
330 result.push_str(" | ");
331 }
332 result.push_str("Only in this agent: ");
333 let preview: String = only_in_other
334 .iter()
335 .take(20)
336 .copied()
337 .collect::<Vec<_>>()
338 .join(" ");
339 result.push_str(&preview);
340 if only_in_other.len() > 20 {
341 result.push_str(&format!(" ... (+{} more)", only_in_other.len() - 20));
342 }
343 }
344
345 if result.len() > 500 {
347 let truncated: String = result.chars().take(497).collect();
348 result = format!("{truncated}...");
349 }
350 result
351}
352
353#[cfg(unix)]
359fn word_diff_size(a: &str, b: &str) -> usize {
360 let counts_a = word_counts(a);
361 let counts_b = word_counts(b);
362
363 let mut diff = 0usize;
364
365 for (word, &count_a) in &counts_a {
366 let count_b = counts_b.get(word).copied().unwrap_or(0);
367 if count_a > count_b {
368 diff += word.len() * (count_a - count_b);
369 }
370 }
371
372 for (word, &count_b) in &counts_b {
373 let count_a = counts_a.get(word).copied().unwrap_or(0);
374 if count_b > count_a {
375 diff += word.len() * (count_b - count_a);
376 }
377 }
378
379 diff
380}
381
#[cfg(test)]
#[cfg(unix)]
mod tests {
    use super::*;

    // normalize_html: script bodies must not survive normalization,
    // while surrounding text content is preserved.
    #[test]
    fn test_normalize_html_strips_scripts() {
        let input = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let normalized = normalize_html(input);
        assert!(!normalized.contains("var x"));
        assert!(normalized.contains("Hello"));
    }

    // normalize_html: style blocks are removed the same way as scripts.
    #[test]
    fn test_normalize_html_strips_styles() {
        let input = "<html><style>.hidden { display:none }</style><body>Hello</body></html>";
        let normalized = normalize_html(input);
        assert!(!normalized.contains("display:none"));
        assert!(normalized.contains("Hello"));
    }

    // normalize_html: per-request nonce attributes are noise and must be
    // stripped so they do not show up as spurious diffs between agents.
    #[test]
    fn test_normalize_html_strips_nonces() {
        let input = r#"<div nonce="abc123">Content</div><p>More</p>"#;
        let normalized = normalize_html(input);
        assert!(
            !normalized.contains("nonce"),
            "nonce attribute should be stripped: {normalized}"
        );
        assert!(normalized.contains("Content"));
    }

    // word_diff_size: identical inputs produce a zero diff.
    #[test]
    fn test_word_diff_size_identical() {
        assert_eq!(word_diff_size("hello world", "hello world"), 0);
    }

    // word_diff_size: any word-level difference yields a positive diff.
    #[test]
    fn test_word_diff_size_different() {
        let diff = word_diff_size("hello world", "hello planet");
        assert!(diff > 0, "different words should produce non-zero diff");
    }

    // word_diff_size: a one-word difference stays at or below the
    // 10-character noise threshold used by `check`.
    #[test]
    fn test_word_diff_size_threshold() {
        let diff = word_diff_size("Welcome to our site today", "Welcome to our site");
        assert!(diff <= 10, "minor diff should be <=10 chars, got {diff}");
    }

    // word_diff_size: substantially different bodies must exceed the
    // threshold so cloaking is flagged.
    #[test]
    fn test_word_diff_size_large_difference() {
        let a = "Welcome to our website. We offer great products and services.";
        let b = "Access denied. This content is not available for automated crawlers.";
        let diff = word_diff_size(a, b);
        assert!(
            diff > 10,
            "significant content difference should exceed threshold, got {diff}"
        );
    }

    // check: URL validation must reject localhost before any fetch occurs
    // (relies on crate::url_validate mentioning "localhost" in its error).
    #[test]
    fn test_cloaking_rejects_localhost_target_before_fetch() {
        match check("http://localhost/") {
            Ok(_) => panic!("expected localhost target to be rejected"),
            Err(err) => assert!(err.contains("localhost")),
        }
    }
}