revka 2026.6.22

//! Link enricher: auto-detects URLs in inbound messages, fetches their content,
//! and prepends summaries so the agent has link context without explicit tool calls.

use regex::Regex;
use std::net::IpAddr;
use std::sync::LazyLock;
use std::time::Duration;

/// Settings that control the link-enrichment stage of the message pipeline.
#[derive(Debug, Clone)]
pub struct LinkEnricherConfig {
    /// Master switch; when `false`, messages pass through unchanged.
    pub enabled: bool,
    /// Upper bound on how many distinct URLs are taken from one message.
    pub max_links: usize,
    /// Overall per-request timeout, in seconds, used when fetching a link.
    pub timeout_secs: u64,
}

impl Default for LinkEnricherConfig {
    /// Enrichment is off by default, with a 3-link cap and a 10-second timeout.
    fn default() -> Self {
        const DEFAULT_MAX_LINKS: usize = 3;
        const DEFAULT_TIMEOUT_SECS: u64 = 10;

        LinkEnricherConfig {
            timeout_secs: DEFAULT_TIMEOUT_SECS,
            max_links: DEFAULT_MAX_LINKS,
            enabled: false,
        }
    }
}

/// URL regex: matches `http://` and `https://` URLs, stopping at whitespace,
/// angle brackets, double quotes, or single quotes.
///
/// Nothing else terminates a match, so trailing punctuation such as `.` or `)`
/// that directly follows a URL in prose is kept as part of the matched text.
static URL_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"https?://[^\s<>"']+"#).expect("URL regex must compile"));

/// Extract URLs from message text, returning up to `max` unique URLs.
///
/// URLs are returned in order of first appearance, and repeated occurrences
/// of the same URL string are collapsed into one entry. Returns an empty
/// vector when `max` is zero or when the text contains no `http://` /
/// `https://` URL.
pub fn extract_urls(text: &str, max: usize) -> Vec<String> {
    // The cap is only checked after a URL has been pushed, so without this
    // guard a cap of zero would still yield the first match.
    if max == 0 {
        return Vec::new();
    }

    let mut unique: Vec<String> = Vec::new();
    for found in URL_RE.find_iter(text) {
        let candidate = found.as_str();
        // Compare on the borrowed match so duplicates never allocate. The
        // linear scan is bounded because the list never exceeds `max` entries.
        if unique.iter().any(|known| known == candidate) {
            continue;
        }
        unique.push(candidate.to_owned());
        if unique.len() >= max {
            break;
        }
    }
    unique
}

/// Returns `true` if the URL points to a private/local address that should be
/// blocked for SSRF protection.
///
/// A URL is rejected (returns `true`) when any of the following holds:
/// - it cannot be parsed, is not `http`/`https`, or has no host;
/// - its host is `localhost`, `local`, or ends in `.localhost` / `.local`
///   (compared case-insensitively and ignoring a trailing root dot);
/// - its host is a literal IP that the shared deny-list classifies as
///   non-global.
///
/// Hostnames that need DNS resolution are *not* resolved here; this function
/// is a pure string/URL check and returns `false` for them.
pub fn is_ssrf_target(url: &str) -> bool {
    // Parse with the same URL crate reqwest uses so our host view matches what
    // reqwest will actually connect to. A hand-rolled parser diverges and lets
    // bypasses through: userinfo (`trusted.com@127.0.0.1`) and alternative IP
    // encodings (decimal `2130706433`, hex `0x7f000001`) — both of which the url
    // crate canonicalizes to a literal IP that then skips the connect-time
    // resolver, so they must be caught here.
    let Ok(parsed) = reqwest::Url::parse(url) else {
        return true; // unparseable URLs are rejected
    };
    if !matches!(parsed.scheme(), "http" | "https") {
        return true;
    }
    let Some(raw_host) = parsed.host_str() else {
        return true; // no host -> reject
    };

    // A fully-qualified name may carry a trailing root dot (`localhost.`,
    // `printer.local.`) and still name the same host. Strip it so the suffix
    // checks below cannot be sidestepped by appending a dot.
    let lowered = raw_host.to_ascii_lowercase();
    let host = lowered.trim_end_matches('.');
    if host.is_empty() {
        return true; // empty host -> reject
    }

    // Hostname-based locals.
    if host == "localhost"
        || host == "local"
        || host.ends_with(".localhost")
        || host.ends_with(".local")
    {
        return true;
    }

    // Literal IPs — host_str() gives the canonical form (decimal/hex/octal IPv4
    // normalized); strip IPv6 brackets before parsing. DNS-resolving hostnames
    // are validated at connect time by the SsrfResolver in fetch_link_summary
    // (which also covers redirect hops).
    let bare = host
        .strip_prefix('[')
        .and_then(|inner| inner.strip_suffix(']'))
        .unwrap_or(host);
    if let Ok(ip) = bare.parse::<IpAddr>() {
        return crate::security::ssrf::is_non_global_ip(ip);
    }

    false
}

/// Pull the text of the first `<title>` element out of an HTML document.
///
/// Tag matching is case-insensitive and tolerates attributes on the opening
/// tag. The search runs on a lowercased copy of the document and the result
/// is sliced from that copy, so the returned title is **lowercased**. A small
/// set of HTML entities is decoded afterwards.
///
/// Returns `None` when there is no complete `<title ...>...</title` pair or
/// when the title consists only of whitespace.
pub fn extract_title(html: &str) -> Option<String> {
    const OPEN_TAG: &str = "<title";
    const CLOSE_TAG: &str = "</title";

    // All offsets stay within the lowercased copy; lowercasing can change
    // byte lengths, so indices from it are never applied to `html`.
    let folded = html.to_lowercase();

    let (_, after_open) = folded.split_once(OPEN_TAG)?;
    // Drop any attributes on the opening tag (e.g. <title lang="en">).
    let (_, after_tag) = after_open.split_once('>')?;
    let (inner, _) = after_tag.split_once(CLOSE_TAG)?;

    let text = inner.trim();
    if text.is_empty() {
        return None;
    }
    Some(html_entity_decode_basic(text))
}

/// Extract the first `max_chars` characters of visible body text from HTML.
///
/// The HTML is converted to plain text and trimmed of surrounding whitespace.
/// If the result holds more than `max_chars` characters (Unicode scalar
/// values, not bytes) it is cut after `max_chars` characters and `"..."` is
/// appended; otherwise it is returned as-is with no ellipsis.
pub fn extract_body_text(html: &str, max_chars: usize) -> String {
    let text = nanohtml2text::html2text(html);
    let trimmed = text.trim();

    // `nth(max_chars)` yields the first character *past* the limit, if any.
    // Its byte offset is a valid cut point, and its absence means the text
    // already fits. Comparing the byte length against `max_chars` instead
    // would add a spurious "..." to short multibyte text.
    match trimmed.char_indices().nth(max_chars) {
        None => trimmed.to_string(),
        Some((cut, _)) => {
            let mut result = String::with_capacity(cut + 3);
            result.push_str(&trimmed[..cut]);
            result.push_str("...");
            result
        }
    }
}

/// Basic HTML entity decoding for title content.
///
/// Decodes `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;` and `&amp;`. Each
/// entity in the input is decoded exactly once: an escaped entity such as
/// `&amp;lt;` becomes the literal text `&lt;`, not `<`. Any other entity is
/// left untouched.
fn html_entity_decode_basic(s: &str) -> String {
    s.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        // `&amp;` must be decoded last: doing it first turns `&amp;lt;` into
        // `&lt;`, which a later pass would then wrongly decode again to `<`.
        .replace("&amp;", "&")
}

/// Summary of a fetched link.
struct LinkSummary {
    /// Page title; `fetch_link_summary` stores `"Untitled"` here when no title
    /// could be extracted from the page.
    title: String,
    /// Leading excerpt of the page's visible text (`fetch_link_summary`
    /// requests at most 200 characters before the ellipsis).
    snippet: String,
}

/// Fetch a single URL and extract a summary. Returns `None` on any failure.
///
/// `timeout_secs` is the overall request timeout; connecting is additionally
/// limited to 5 seconds. `None` is returned when the client cannot be built,
/// the request fails, the final status is not a success, the response declares
/// a `Content-Type` other than `text/html`, or reading the body fails. A
/// response with no `Content-Type` header is still processed.
///
/// At most 256 KiB of the body is downloaded; the rest of the response is
/// never read. When no title can be extracted, the title is `"Untitled"`.
async fn fetch_link_summary(url: &str, timeout_secs: u64) -> Option<LinkSummary> {
    // Upper bound on body bytes downloaded for title/snippet extraction.
    const MAX_BODY_BYTES: usize = 256 * 1024;

    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(timeout_secs))
        .connect_timeout(Duration::from_secs(5))
        // SSRF defense, two layers — both are required:
        // 1. Per-hop redirect check: reqwest connects to *literal-IP* authorities
        //    WITHOUT invoking the custom dns_resolver below, so a 302 to
        //    http://169.254.169.254/ would otherwise slip past it. Re-run the
        //    string SSRF check on every redirect target (max 5 hops).
        // 2. Connect-time dns_resolver: validates every *resolved* IP (original
        //    host and each hop) against the deny-list, stopping a public
        //    hostname that resolves to an internal address (DNS-rebinding).
        .redirect(reqwest::redirect::Policy::custom(|attempt| {
            if attempt.previous().len() >= 5 {
                return attempt.stop();
            }
            if is_ssrf_target(attempt.url().as_str()) {
                return attempt.error(std::io::Error::new(
                    std::io::ErrorKind::PermissionDenied,
                    "link enricher: blocked redirect to private/SSRF target",
                ));
            }
            attempt.follow()
        }))
        .user_agent("Revka/0.1 (link-enricher)")
        .dns_resolver(std::sync::Arc::new(
            crate::security::ssrf::SsrfResolver::deny_private(),
        ))
        .build()
        .ok()?;

    let mut response = client.get(url).send().await.ok()?;
    if !response.status().is_success() {
        return None;
    }

    // Only process text/html responses (a missing header is given the benefit
    // of the doubt).
    let content_type = response
        .headers()
        .get(reqwest::header::CONTENT_TYPE)
        .and_then(|v| v.to_str().ok())
        .unwrap_or("")
        .to_lowercase();

    if !content_type.contains("text/html") && !content_type.is_empty() {
        return None;
    }

    // Stream the body and stop once the cap is reached. Buffering the whole
    // response first would let a remote server force an arbitrarily large
    // allocation even though only the first 256 KiB is ever used.
    let mut raw: Vec<u8> = Vec::new();
    while raw.len() < MAX_BODY_BYTES {
        let Some(chunk) = response.chunk().await.ok()? else {
            break; // end of body
        };
        let room = MAX_BODY_BYTES - raw.len();
        raw.extend_from_slice(&chunk[..chunk.len().min(room)]);
    }
    // Lossy decoding copes with non-UTF-8 pages and with a multibyte
    // character split by the byte cap.
    let body = String::from_utf8_lossy(&raw).into_owned();

    let title = extract_title(&body).unwrap_or_else(|| "Untitled".to_string());
    let snippet = extract_body_text(&body, 200);

    Some(LinkSummary { title, snippet })
}

/// Enrich a message by prepending link summaries for any URLs found in the text.
///
/// This is the main entry point called from the channel message processing pipeline.
/// If the enricher is disabled or no URLs are found, the original message is returned
/// unchanged.
pub async fn enrich_message(content: &str, config: &LinkEnricherConfig) -> String {
    if !config.enabled || config.max_links == 0 {
        return content.to_string();
    }

    let urls = extract_urls(content, config.max_links);
    if urls.is_empty() {
        return content.to_string();
    }

    // Filter out SSRF targets
    let safe_urls: Vec<&str> = urls
        .iter()
        .filter(|u| !is_ssrf_target(u))
        .map(|u| u.as_str())
        .collect();
    if safe_urls.is_empty() {
        return content.to_string();
    }

    let mut enrichments = Vec::new();
    for url in safe_urls {
        match fetch_link_summary(url, config.timeout_secs).await {
            Some(summary) => {
                enrichments.push(format!("[Link: {} — {}]", summary.title, summary.snippet));
            }
            None => {
                tracing::debug!(url, "Link enricher: failed to fetch or extract summary");
            }
        }
    }

    if enrichments.is_empty() {
        return content.to_string();
    }

    let prefix = enrichments.join("\n");
    format!("{prefix}\n{content}")
}

// Unit tests for the link enricher. The async `enrich_message` tests only
// exercise paths that return before any fetch is attempted (disabled config,
// no URLs, or URLs expected to be rejected by the SSRF check), so they are
// not expected to perform network I/O.
#[cfg(test)]
mod tests {
    use super::*;

    // ── URL extraction ──────────────────────────────────────────────

    #[test]
    fn extract_urls_finds_http_and_https() {
        let text = "Check https://example.com and http://test.org/page for info";
        let urls = extract_urls(text, 10);
        assert_eq!(urls, vec!["https://example.com", "http://test.org/page",]);
    }

    #[test]
    fn extract_urls_respects_max() {
        // Only the first `max` unique URLs, in order of appearance, are kept.
        let text = "https://a.com https://b.com https://c.com https://d.com";
        let urls = extract_urls(text, 2);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0], "https://a.com");
        assert_eq!(urls[1], "https://b.com");
    }

    #[test]
    fn extract_urls_deduplicates() {
        let text = "Visit https://example.com and https://example.com again";
        let urls = extract_urls(text, 10);
        assert_eq!(urls.len(), 1);
    }

    #[test]
    fn extract_urls_handles_no_urls() {
        let text = "Just a normal message without links";
        let urls = extract_urls(text, 10);
        assert!(urls.is_empty());
    }

    #[test]
    fn extract_urls_stops_at_angle_brackets() {
        let text = "Link: <https://example.com/path> done";
        let urls = extract_urls(text, 10);
        assert_eq!(urls, vec!["https://example.com/path"]);
    }

    #[test]
    fn extract_urls_stops_at_quotes() {
        let text = r#"href="https://example.com/page" end"#;
        let urls = extract_urls(text, 10);
        assert_eq!(urls, vec!["https://example.com/page"]);
    }

    // ── SSRF protection ─────────────────────────────────────────────

    #[test]
    fn ssrf_blocks_localhost() {
        assert!(is_ssrf_target("http://localhost/admin"));
        assert!(is_ssrf_target("https://localhost:8080/api"));
    }

    #[test]
    fn ssrf_blocks_loopback_ip() {
        assert!(is_ssrf_target("http://127.0.0.1/secret"));
        assert!(is_ssrf_target("http://127.0.0.2:9090"));
    }

    #[test]
    fn ssrf_blocks_private_10_network() {
        assert!(is_ssrf_target("http://10.0.0.1/internal"));
        assert!(is_ssrf_target("http://10.255.255.255"));
    }

    #[test]
    fn ssrf_blocks_private_172_network() {
        assert!(is_ssrf_target("http://172.16.0.1/admin"));
        assert!(is_ssrf_target("http://172.31.255.255"));
    }

    #[test]
    fn ssrf_blocks_private_192_168_network() {
        assert!(is_ssrf_target("http://192.168.1.1/router"));
        assert!(is_ssrf_target("http://192.168.0.100:3000"));
    }

    #[test]
    fn ssrf_blocks_link_local() {
        assert!(is_ssrf_target("http://169.254.0.1/metadata"));
        assert!(is_ssrf_target("http://169.254.169.254/latest"));
    }

    #[test]
    fn ssrf_blocks_ipv6_loopback() {
        // [::1] is parsed and caught by the deny-list as an IPv6 loopback.
        assert!(is_ssrf_target("http://[::1]/admin"));
        // IPv4-compatible IPv6 embedding a metadata IP is also blocked.
        assert!(is_ssrf_target("http://[::169.254.169.254]/"));
    }

    #[test]
    fn ssrf_blocks_dot_local() {
        assert!(is_ssrf_target("http://myhost.local/api"));
    }

    #[test]
    fn ssrf_allows_public_urls() {
        // Plain hostnames are not resolved by the string check, and a public
        // literal IP is expected to pass the deny-list.
        assert!(!is_ssrf_target("https://example.com/page"));
        assert!(!is_ssrf_target("https://www.google.com"));
        assert!(!is_ssrf_target("http://93.184.216.34/resource"));
    }

    #[test]
    fn ssrf_blocks_cgnat_via_shared_denylist() {
        // #402: routing literal-IP checks through the shared deny-list extends
        // coverage to ranges the old local check missed (e.g. 100.64.0.0/10 CGNAT).
        assert!(is_ssrf_target("http://100.64.0.1/x"));
    }

    #[test]
    fn ssrf_blocks_userinfo_literal_ip() {
        // #402 re-review: userinfo must not hide a private host. The real host
        // is 127.0.0.1 / 169.254.169.254, which reqwest connects to directly
        // (skipping the resolver, as it's a literal IP), so catch it here.
        assert!(is_ssrf_target("http://trusted.com@127.0.0.1/admin"));
        assert!(is_ssrf_target("http://user:pass@169.254.169.254/latest"));
    }

    #[test]
    fn ssrf_blocks_alternative_ip_encodings() {
        // #402 re-review: decimal/hex IPv4 encodings normalize to a literal IP
        // (which skips the connect-time resolver), so they must be blocked here.
        assert!(is_ssrf_target("http://2130706433/")); // 127.0.0.1 decimal
        assert!(is_ssrf_target("http://0x7f000001/")); // 127.0.0.1 hex
    }

    // ── Title extraction ────────────────────────────────────────────
    //
    // `extract_title` slices the title out of a lowercased copy of the HTML,
    // so the expected values below are lowercase even when the source title
    // is mixed or upper case.

    #[test]
    fn extract_title_basic() {
        let html = "<html><head><title>My Page Title</title></head><body>Hello</body></html>";
        assert_eq!(extract_title(html), Some("my page title".to_string()));
    }

    #[test]
    fn extract_title_with_entities() {
        let html = "<title>Tom &amp; Jerry&#39;s Page</title>";
        assert_eq!(extract_title(html), Some("tom & jerry's page".to_string()));
    }

    #[test]
    fn extract_title_case_insensitive() {
        let html = "<HTML><HEAD><TITLE>Upper Case</TITLE></HEAD></HTML>";
        assert_eq!(extract_title(html), Some("upper case".to_string()));
    }

    #[test]
    fn extract_title_multibyte_chars_no_panic() {
        // İ (U+0130) lowercases to 2 chars, changing byte length.
        // This must not panic or produce wrong offsets.
        let html = "<title>İstanbul Guide</title>";
        let result = extract_title(html);
        assert!(result.is_some());
        let title = result.unwrap();
        assert!(title.contains("stanbul"));
    }

    #[test]
    fn extract_title_missing() {
        let html = "<html><body>No title here</body></html>";
        assert_eq!(extract_title(html), None);
    }

    #[test]
    fn extract_title_empty() {
        // A whitespace-only title is treated the same as a missing one.
        let html = "<title>   </title>";
        assert_eq!(extract_title(html), None);
    }

    // ── Body text extraction ────────────────────────────────────────

    #[test]
    fn extract_body_text_strips_html() {
        let html = "<html><body><h1>Header</h1><p>Some content here</p></body></html>";
        let text = extract_body_text(html, 200);
        assert!(text.contains("Header"));
        assert!(text.contains("Some content"));
        assert!(!text.contains("<h1>"));
    }

    #[test]
    fn extract_body_text_truncates() {
        let html = "<p>A very long paragraph that should be truncated to fit within the limit.</p>";
        let text = extract_body_text(html, 20);
        assert!(text.len() <= 25); // 20 chars + "..."
        assert!(text.ends_with("..."));
    }

    // ── Config toggle ───────────────────────────────────────────────

    #[tokio::test]
    async fn enrich_message_disabled_returns_original() {
        let config = LinkEnricherConfig {
            enabled: false,
            max_links: 3,
            timeout_secs: 10,
        };
        let msg = "Check https://example.com for details";
        let result = enrich_message(msg, &config).await;
        assert_eq!(result, msg);
    }

    #[tokio::test]
    async fn enrich_message_no_urls_returns_original() {
        let config = LinkEnricherConfig {
            enabled: true,
            max_links: 3,
            timeout_secs: 10,
        };
        let msg = "No links in this message";
        let result = enrich_message(msg, &config).await;
        assert_eq!(result, msg);
    }

    #[tokio::test]
    async fn enrich_message_ssrf_urls_returns_original() {
        // Both URLs are filtered out before fetching, so nothing is prepended.
        let config = LinkEnricherConfig {
            enabled: true,
            max_links: 3,
            timeout_secs: 10,
        };
        let msg = "Try http://127.0.0.1/admin and http://192.168.1.1/router";
        let result = enrich_message(msg, &config).await;
        assert_eq!(result, msg);
    }

    #[test]
    fn default_config_is_disabled() {
        let config = LinkEnricherConfig::default();
        assert!(!config.enabled);
        assert_eq!(config.max_links, 3);
        assert_eq!(config.timeout_secs, 10);
    }
}