tirith-core 0.2.11

Terminal security analysis engine - homograph attacks, pipe-to-shell, ANSI injection
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
/// Server-side cloaking detection — Unix only.
///
/// Fetches a URL with multiple user-agents and compares responses to detect
/// content differentiation (serving different content to AI bots vs browsers).
#[cfg(unix)]
use crate::verdict::{Evidence, Finding, RuleId, Severity};

/// User-agent profiles for cloaking detection.
///
/// Each entry is `(short name, User-Agent header value)`. The first entry
/// ("chrome") is used as the comparison baseline in `check`; the remaining
/// entries are bot/crawler identities a cloaking server might treat
/// differently from a human browser.
#[cfg(unix)]
const USER_AGENTS: &[(&str, &str)] = &[
    // Baseline: mainstream desktop browser.
    ("chrome", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"),
    // AI assistant crawlers — common cloaking targets.
    ("claudebot", "ClaudeBot/1.0"),
    ("chatgpt", "ChatGPT-User"),
    ("perplexity", "PerplexityBot/1.0"),
    // Search-engine crawler.
    ("googlebot", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"),
    // Generic command-line HTTP client.
    ("curl", "curl/8.7.1"),
];

/// Result of a cloaking check.
#[cfg(unix)]
pub struct CloakingResult {
    /// The URL exactly as the caller passed it (not the validated form).
    pub url: String,
    /// True when at least one agent's normalized response differed from the
    /// chrome baseline by more than the detection threshold.
    pub cloaking_detected: bool,
    /// Findings for the verdict pipeline; empty when no cloaking was detected.
    pub findings: Vec<Finding>,
    /// Per-agent response summaries (agent name, status code, content length).
    pub agent_responses: Vec<AgentResponse>,
    /// Pairs of agents whose responses differed significantly.
    pub diff_pairs: Vec<DiffPair>,
}

/// Summary of a single user-agent fetch.
///
/// A `status_code` of 0 marks a fetch whose request failed outright; such
/// entries carry an empty body (`content_length == 0`).
#[cfg(unix)]
#[derive(Debug, Clone)]
pub struct AgentResponse {
    /// Short agent name from `USER_AGENTS` (e.g. "chrome", "claudebot").
    pub agent_name: String,
    /// HTTP status of the response, or 0 when the request itself failed.
    pub status_code: u16,
    /// Response body length in bytes.
    pub content_length: usize,
}

/// One baseline-vs-agent comparison whose normalized content differed.
#[cfg(unix)]
#[derive(Debug, Clone)]
pub struct DiffPair {
    /// Baseline agent name (the comparison reference).
    pub agent_a: String,
    /// Agent whose response diverged from the baseline.
    pub agent_b: String,
    /// Approximate size of the difference, in characters.
    pub diff_chars: usize,
    /// Full diff text (populated for Pro enrichment).
    pub diff_text: Option<String>,
}

#[cfg(unix)]
impl CloakingResult {
    /// Serialize to JSON. When `include_diff_text` is true (Pro tier), the
    /// per-pair diff text is included in the output; otherwise it is omitted.
    pub fn to_json(&self, include_diff_text: bool) -> serde_json::Value {
        // Per-agent summaries.
        let agents: Vec<serde_json::Value> = self
            .agent_responses
            .iter()
            .map(|resp| {
                serde_json::json!({
                    "agent": resp.agent_name,
                    "status_code": resp.status_code,
                    "content_length": resp.content_length,
                })
            })
            .collect();

        // Diff pairs, with diff text attached only for the Pro tier.
        let diffs: Vec<serde_json::Value> = self
            .diff_pairs
            .iter()
            .map(|pair| {
                let mut obj = serde_json::json!({
                    "agent_a": pair.agent_a,
                    "agent_b": pair.agent_b,
                    "diff_chars": pair.diff_chars,
                });
                if include_diff_text {
                    if let Some(text) = pair.diff_text.as_ref() {
                        obj.as_object_mut()
                            .expect("json! object literal is always a map")
                            .insert("diff_text".into(), serde_json::json!(text));
                    }
                }
                obj
            })
            .collect();

        serde_json::json!({
            "url": self.url,
            "cloaking_detected": self.cloaking_detected,
            "agents": agents,
            "diffs": diffs,
            "findings": self.findings,
        })
    }
}

/// Minimum character difference (after HTML normalization) for a response to
/// count as cloaked. Differences at or below this are treated as cosmetic
/// variation (timestamps, counters) rather than content differentiation.
#[cfg(unix)]
const CLOAKING_DIFF_THRESHOLD: usize = 10;

/// Check a URL for server-side cloaking.
///
/// Fetches the URL once per entry in `USER_AGENTS`, normalizes each body,
/// and compares every non-baseline response against the chrome baseline.
/// Responses differing by more than `CLOAKING_DIFF_THRESHOLD` characters
/// produce a `DiffPair`, and any difference yields a single high-severity
/// `Finding`.
///
/// # Errors
///
/// Returns `Err` when the URL fails validation, the HTTP client cannot be
/// built, or every user-agent fetch fails.
#[cfg(unix)]
pub fn check(url: &str) -> Result<CloakingResult, String> {
    let validated_url = crate::url_validate::validate_fetch_url(url)?;
    // Re-validate every redirect hop so a safe initial URL cannot bounce to a
    // blocked target (SSRF protection), and cap the redirect chain length.
    let client = reqwest::blocking::Client::builder()
        .timeout(std::time::Duration::from_secs(30))
        .redirect(reqwest::redirect::Policy::custom(|attempt| {
            if attempt.previous().len() > 10 {
                attempt.error("too many redirects")
            } else if let Err(reason) =
                crate::url_validate::validate_fetch_url(attempt.url().as_str())
            {
                attempt.error(reason)
            } else {
                attempt.follow()
            }
        }))
        .build()
        .map_err(|e| format!("HTTP client error: {e}"))?;

    const MAX_BODY: usize = 10 * 1024 * 1024; // 10 MiB

    // Fetch with each user-agent. Failed fetches are recorded with status 0
    // and an empty body so per-agent reporting stays complete.
    let mut responses: Vec<(String, u16, String)> = Vec::new();
    for (name, ua) in USER_AGENTS {
        match fetch_with_ua(&client, validated_url.as_str(), ua, MAX_BODY) {
            Ok((status, body)) => {
                responses.push((name.to_string(), status, body));
            }
            Err(e) => {
                eprintln!("tirith: cloaking: {name} fetch failed: {e}");
                responses.push((name.to_string(), 0, String::new()));
            }
        }
    }

    // Bail out when nothing succeeded — there is no data to compare.
    let successful_count = responses.iter().filter(|(_, s, _)| *s != 0).count();
    if successful_count == 0 {
        return Err("all user-agent fetches failed — cannot perform cloaking analysis".to_string());
    }

    // Build per-agent summaries once (this construction was previously
    // duplicated on the early-return and normal paths).
    let agent_responses = summarize_responses(&responses);

    // chrome (index 0 in USER_AGENTS) is the comparison baseline.
    let baseline_idx = 0;
    let baseline_body = &responses[baseline_idx].2;

    // If the baseline fetch failed (empty body), we cannot reliably compare —
    // return no findings rather than false-flagging every non-empty response
    // as cloaking.
    if baseline_body.is_empty() {
        return Ok(CloakingResult {
            url: url.to_string(),
            cloaking_detected: false,
            findings: Vec::new(),
            agent_responses,
            diff_pairs: Vec::new(),
        });
    }

    let baseline_normalized = normalize_html(baseline_body);

    let mut diff_pairs = Vec::new();
    let mut cloaking_detected = false;

    // Compare each non-baseline response against the chrome baseline.
    for (i, (name, _status, body)) in responses.iter().enumerate() {
        if i == baseline_idx || body.is_empty() {
            continue; // Skip the baseline itself and failed fetches.
        }

        let normalized = normalize_html(body);
        let diff_chars = word_diff_size(&baseline_normalized, &normalized);

        if diff_chars > CLOAKING_DIFF_THRESHOLD {
            cloaking_detected = true;
            // Generate diff text showing which words differ.
            let diff_detail = generate_diff_text(&baseline_normalized, &normalized);
            diff_pairs.push(DiffPair {
                agent_a: "chrome".to_string(),
                agent_b: name.clone(),
                diff_chars,
                diff_text: Some(diff_detail),
            });
        }
    }

    let mut findings = Vec::new();
    if cloaking_detected {
        let differing: Vec<&str> = diff_pairs.iter().map(|d| d.agent_b.as_str()).collect();
        findings.push(Finding {
            rule_id: RuleId::ServerCloaking,
            severity: Severity::High,
            title: "Server-side cloaking detected".to_string(),
            description: format!(
                "URL serves different content to different user-agents. \
                 Differing agents: {}",
                differing.join(", ")
            ),
            evidence: diff_pairs
                .iter()
                .map(|d| Evidence::Text {
                    detail: format!(
                        "{} vs {}: {} chars different",
                        d.agent_a, d.agent_b, d.diff_chars
                    ),
                })
                .collect(),
            human_view: None,
            agent_view: None,
            mitre_id: None,
            custom_rule_id: None,
        });
    }

    Ok(CloakingResult {
        url: url.to_string(),
        cloaking_detected,
        findings,
        agent_responses,
        diff_pairs,
    })
}

/// Build per-agent summaries from raw `(name, status, body)` fetch results.
#[cfg(unix)]
fn summarize_responses(responses: &[(String, u16, String)]) -> Vec<AgentResponse> {
    responses
        .iter()
        .map(|(name, status, body)| AgentResponse {
            agent_name: name.clone(),
            status_code: *status,
            content_length: body.len(),
        })
        .collect()
}

/// Fetch `url` once with the given User-Agent header, returning the status
/// code and body. Bodies larger than `max_body` bytes are rejected.
#[cfg(unix)]
fn fetch_with_ua(
    client: &reqwest::blocking::Client,
    url: &str,
    ua: &str,
    max_body: usize,
) -> Result<(u16, String), String> {
    use std::io::Read as _;

    let resp = client
        .get(url)
        .header("User-Agent", ua)
        .send()
        .map_err(|e| format!("request failed: {e}"))?;
    let status = resp.status().as_u16();

    // Reject early when the server declares an oversized body.
    if let Some(len) = resp.content_length() {
        if len > max_body as u64 {
            return Err(format!("response too large: {len} bytes"));
        }
    }

    // Stream at most max_body + 1 bytes so a missing (or lying)
    // Content-Length cannot exhaust memory; the extra byte detects overflow.
    let mut raw = Vec::with_capacity(max_body.min(1024 * 1024));
    resp.take((max_body as u64) + 1)
        .read_to_end(&mut raw)
        .map_err(|e| format!("read body: {e}"))?;
    if raw.len() > max_body {
        return Err(format!("response too large: {} bytes", raw.len()));
    }

    // Lossy conversion tolerates non-UTF-8 bytes in server responses.
    Ok((status, String::from_utf8_lossy(&raw).into_owned()))
}

/// Normalize HTML for comparison — strip volatile content that changes
/// between requests (scripts, styles, CSRF tokens, nonces, timestamps).
#[cfg(unix)]
fn normalize_html(input: &str) -> String {
    use once_cell::sync::Lazy;
    use regex::Regex;

    // Compiled once on first use. The tag-body patterns use (?is):
    // case-insensitive with dot matching newlines.
    static RE_SCRIPT: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap());
    static RE_STYLE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap());
    static RE_NONCE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"(?i)\bnonce="[^"]*""#).unwrap());
    static RE_CSRF: Lazy<Regex> =
        Lazy::new(|| Regex::new(r#"(?i)<[^>]*csrf[_-]?token[^>]*>"#).unwrap());
    static RE_WS: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());

    // Strip each volatile category in turn, then collapse runs of
    // whitespace to single spaces and trim the ends.
    let pass = RE_SCRIPT.replace_all(input, "");
    let pass = RE_STYLE.replace_all(&pass, "");
    let pass = RE_NONCE.replace_all(&pass, "");
    let pass = RE_CSRF.replace_all(&pass, "");
    let pass = RE_WS.replace_all(&pass, " ");
    pass.trim().to_owned()
}

/// Build a word-frequency map for diff computation.
///
/// Words are whitespace-separated tokens; the map borrows slices of `s`.
#[cfg(unix)]
fn word_counts(s: &str) -> std::collections::HashMap<&str, usize> {
    s.split_whitespace()
        .fold(std::collections::HashMap::new(), |mut tally, word| {
            *tally.entry(word).or_default() += 1;
            tally
        })
}

/// Generate a human-readable summary of word-level differences between two texts.
///
/// Shows words present in one response but not the other, capped at 500
/// characters. Word lists are sorted before previewing so the output is
/// deterministic (HashMap iteration order varies between runs).
#[cfg(unix)]
fn generate_diff_text(baseline: &str, other: &str) -> String {
    let counts_a = word_counts(baseline);
    let counts_b = word_counts(other);

    // Words with more occurrences in the baseline than in the other response.
    let mut only_in_baseline: Vec<&str> = counts_a
        .iter()
        .filter(|(word, &count_a)| count_a > counts_b.get(*word).copied().unwrap_or(0))
        .map(|(word, _)| *word)
        .collect();

    // Words with more occurrences in the other response than in the baseline.
    let mut only_in_other: Vec<&str> = counts_b
        .iter()
        .filter(|(word, &count_b)| count_b > counts_a.get(*word).copied().unwrap_or(0))
        .map(|(word, _)| *word)
        .collect();

    // Sort for stable, reproducible diff text across runs.
    only_in_baseline.sort_unstable();
    only_in_other.sort_unstable();

    let mut result = String::new();
    append_word_preview(&mut result, "Only in baseline (chrome): ", &only_in_baseline);
    append_word_preview(&mut result, "Only in this agent: ", &only_in_other);

    // Char-safe truncation: count and cut on char boundaries so the cap is
    // consistent and never panics on a multibyte boundary.
    if result.chars().count() > 500 {
        let truncated: String = result.chars().take(497).collect();
        result = format!("{truncated}...");
    }
    result
}

/// Append `label` plus up to 20 words from `words` (with a "+N more" suffix
/// when truncated) to `out`, separated from existing content by " | ".
/// Does nothing when `words` is empty.
#[cfg(unix)]
fn append_word_preview(out: &mut String, label: &str, words: &[&str]) {
    if words.is_empty() {
        return;
    }
    if !out.is_empty() {
        out.push_str(" | ");
    }
    out.push_str(label);
    let preview: String = words.iter().take(20).copied().collect::<Vec<_>>().join(" ");
    out.push_str(&preview);
    if words.len() > 20 {
        out.push_str(&format!(" ... (+{} more)", words.len() - 20));
    }
}

/// Simple word-level diff size in characters.
///
/// Counts total characters in words that are in one string but not the other.
/// This is a rough measure — not a proper edit distance, but sufficient for
/// detecting meaningful content differences vs. cosmetic variations.
#[cfg(unix)]
fn word_diff_size(a: &str, b: &str) -> usize {
    use std::collections::HashMap;

    // Local tally of whitespace-separated word frequencies.
    fn tally(s: &str) -> HashMap<&str, usize> {
        let mut m = HashMap::new();
        for w in s.split_whitespace() {
            *m.entry(w).or_insert(0) += 1;
        }
        m
    }

    let left = tally(a);
    let right = tally(b);

    // Characters contributed by occurrences present in `from` beyond their
    // count in `into`; symmetric application covers both directions.
    let one_way = |from: &HashMap<&str, usize>, into: &HashMap<&str, usize>| {
        from.iter()
            .map(|(word, &n)| {
                let other = into.get(word).copied().unwrap_or(0);
                word.len() * n.saturating_sub(other)
            })
            .sum::<usize>()
    };

    one_way(&left, &right) + one_way(&right, &left)
}

#[cfg(test)]
#[cfg(unix)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_html_strips_scripts() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let result = normalize_html(html);
        assert!(result.contains("Hello"));
        assert!(!result.contains("var x"));
    }

    #[test]
    fn test_normalize_html_strips_styles() {
        let html = "<html><style>.hidden { display:none }</style><body>Hello</body></html>";
        let result = normalize_html(html);
        assert!(result.contains("Hello"));
        assert!(!result.contains("display:none"));
    }

    #[test]
    fn test_normalize_html_strips_nonces() {
        // A non-script element exercises the NONCE regex directly; a <script>
        // tag would be removed wholesale by the SCRIPT regex before NONCE runs.
        let html = r#"<div nonce="abc123">Content</div><p>More</p>"#;
        let result = normalize_html(html);
        assert!(result.contains("Content"));
        assert!(
            !result.contains("nonce"),
            "nonce attribute should be stripped: {result}"
        );
    }

    #[test]
    fn test_word_diff_size_identical() {
        assert_eq!(0, word_diff_size("hello world", "hello world"));
    }

    #[test]
    fn test_word_diff_size_different() {
        assert!(
            word_diff_size("hello world", "hello planet") > 0,
            "different words should produce non-zero diff"
        );
    }

    #[test]
    fn test_word_diff_size_threshold() {
        // One extra word is a cosmetic difference and must stay under threshold.
        let diff = word_diff_size("Welcome to our site today", "Welcome to our site");
        assert!(diff <= 10, "minor diff should be <=10 chars, got {diff}");
    }

    #[test]
    fn test_word_diff_size_large_difference() {
        let human = "Welcome to our website. We offer great products and services.";
        let bot = "Access denied. This content is not available for automated crawlers.";
        let diff = word_diff_size(human, bot);
        assert!(
            diff > 10,
            "significant content difference should exceed threshold, got {diff}"
        );
    }

    #[test]
    fn test_cloaking_rejects_localhost_target_before_fetch() {
        match check("http://localhost/") {
            Err(err) => assert!(err.contains("localhost")),
            Ok(_) => panic!("expected localhost target to be rejected"),
        }
    }
}