// raymon 0.3.0
//
// Stateful MCP server and TUI for Ray-style logs
// Documentation
//! Sanitization helpers for inbound payload content.
//!
//! Ray clients sometimes send very large blobs (e.g. base64 data URLs) or HTML fragments from debug
//! tools (like Symfony VarDumper). These helpers normalize and redact such content so the TUI and
//! storage remain responsive.

use crate::raymon_core::Entry;
use serde_json::Value;

/// Minimum byte length (16 KiB) a string must reach before it is considered for blob redaction;
/// shorter strings are never redacted.
const BLOB_STRING_LEN_THRESHOLD: usize = 16 * 1024;
/// Text that replaces a string value once it has been classified as a blob.
const BLOB_STRING_PLACEHOLDER: &str = "[[raymon:blob redacted]]";

/// Cleans the content of every payload attached to `entry`, mutating it in place.
///
/// Each payload's JSON content is walked recursively and every string found is:
///
/// - converted from Symfony VarDumper HTML to plain text when it looks like such a dump
///   (best-effort), and
/// - replaced by a placeholder when it is a large base64 / data-URL blob.
pub fn sanitize_entry(entry: &mut Entry) {
    entry
        .payloads
        .iter_mut()
        .for_each(|payload| sanitize_value(&mut payload.content));
}

/// Recursively sanitizes a JSON value in place.
///
/// Strings are first converted from VarDumper HTML to plain text (when applicable) and then
/// checked for blob redaction, so the redaction decision is made on the cleaned text. Arrays and
/// objects are traversed element by element; object keys are left untouched. All other JSON
/// values are ignored.
fn sanitize_value(value: &mut Value) {
    match value {
        Value::Object(fields) => {
            for (_key, child) in fields.iter_mut() {
                sanitize_value(child);
            }
        }
        Value::Array(elements) => elements.iter_mut().for_each(sanitize_value),
        Value::String(text) => {
            if let Some(plain) = sanitize_symfony_var_dumper_html(text) {
                *text = plain;
            }
            if should_redact_blob_string(text) {
                *text = String::from(BLOB_STRING_PLACEHOLDER);
            }
        }
        Value::Null | Value::Bool(_) | Value::Number(_) => {}
    }
}

/// Decides whether `value` is a blob that should be replaced by the redaction placeholder.
///
/// Strings shorter than [`BLOB_STRING_LEN_THRESHOLD`] bytes (measured before trimming) are never
/// redacted. Longer strings are redacted when, after trimming surrounding whitespace, they are
/// either a `data:` URL carrying a `;base64,` marker or look like raw base64 text.
fn should_redact_blob_string(value: &str) -> bool {
    let is_large = value.len() >= BLOB_STRING_LEN_THRESHOLD;
    if !is_large {
        return false;
    }

    let candidate = value.trim();
    let is_base64_data_url = candidate.starts_with("data:") && candidate.contains(";base64,");

    is_base64_data_url || looks_like_base64(candidate)
}

/// Heuristically checks whether `value` consists of base64 text.
///
/// Returns `false` for empty or non-ASCII input. Otherwise only the first 512 bytes are sampled,
/// and each must belong to the standard or URL-safe base64 alphabet (`A-Z a-z 0-9 + / - _`), be
/// the padding character `=`, or be a CR/LF line break. Anything past the sample is not checked.
fn looks_like_base64(value: &str) -> bool {
    const SAMPLE_BYTES: usize = 512;

    if value.is_empty() || !value.is_ascii() {
        return false;
    }

    value.bytes().take(SAMPLE_BYTES).all(|byte| {
        byte.is_ascii_alphanumeric()
            || matches!(byte, b'+' | b'/' | b'-' | b'_' | b'=' | b'\n' | b'\r')
    })
}

/// Converts a Symfony VarDumper HTML dump into plain text.
///
/// Returns `None` when `input` does not look like a VarDumper dump, or when nothing but
/// whitespace remains after cleaning; callers keep the original text in both cases. Otherwise the
/// `<script>` and `<style>` blocks are dropped together with their contents, the remaining tags
/// are removed, HTML entities are decoded, and the trimmed result is returned.
fn sanitize_symfony_var_dumper_html(input: &str) -> Option<String> {
    if !looks_like_symfony_var_dumper_html(input) {
        return None;
    }

    let without_scripts = strip_html_tag_blocks(input, "script");
    let without_styles = strip_html_tag_blocks(&without_scripts, "style");
    let text_only = strip_html_tags(&without_styles);
    let decoded = decode_html_entities_lossy(&text_only);

    let plain = decoded.trim();
    (!plain.is_empty()).then(|| plain.to_string())
}

/// Heuristically detects Symfony VarDumper HTML output.
///
/// The text must mention the `sf-dump` marker and additionally either start (ignoring leading
/// whitespace) with a `<pre` tag or mention the `Sfdump` JavaScript helper anywhere.
fn looks_like_symfony_var_dumper_html(input: &str) -> bool {
    // Every accepted shape requires the `sf-dump` marker, so check it once up front.
    if !input.contains("sf-dump") {
        return false;
    }

    // A dump that starts with `<script` is also covered by the `Sfdump` check.
    input.trim_start().starts_with("<pre") || input.contains("Sfdump")
}

/// Removes every `<tag ...> ... </tag>` block from `input`, including the enclosed content.
///
/// Matching is case-sensitive. An opening tag is only recognised when the tag name is followed by
/// `>`, `/`, ASCII whitespace, or the end of the input, so stripping `script` leaves an element
/// such as `<scripted>` alone. If an opening tag is never terminated by `>` or has no matching
/// closing tag, everything from that opening tag to the end of the input is dropped
/// (best-effort: an unterminated block is treated as running to the end).
fn strip_html_tag_blocks(input: &str, tag: &str) -> String {
    let open_tag = format!("<{tag}");
    let close_tag = format!("</{tag}>");
    let mut out = String::with_capacity(input.len());
    // Start of the text that has not been copied into `out` yet.
    let mut cursor = 0usize;
    // Where the search for the next opening tag resumes. It runs ahead of `cursor` only after a
    // candidate was rejected because it was merely a prefix of a longer tag name.
    let mut search_from = 0usize;

    while let Some(rel_start) = input[search_from..].find(open_tag.as_str()) {
        let start = search_from + rel_start;
        let name_end = start + open_tag.len();

        // Reject `<scripted`, `<styles`, ... where the name continues past `tag`.
        let at_name_boundary = match input.as_bytes().get(name_end) {
            Some(&next) => next == b'>' || next == b'/' || next.is_ascii_whitespace(),
            None => true,
        };
        if !at_name_boundary {
            search_from = name_end;
            continue;
        }

        out.push_str(&input[cursor..start]);

        let Some(rel_open_end) = input[start..].find('>') else {
            cursor = input.len();
            break;
        };
        let open_end = start + rel_open_end + 1;

        let Some(rel_close) = input[open_end..].find(close_tag.as_str()) else {
            cursor = input.len();
            break;
        };
        cursor = open_end + rel_close + close_tag.len();
        search_from = cursor;
    }

    out.push_str(&input[cursor..]);
    out
}

/// Drops everything between `<` and the next `>` (inclusive) and returns the remaining text.
///
/// A `>` that appears outside of a tag is kept as ordinary text. A `<` that is never closed
/// discards the rest of the input.
fn strip_html_tags(input: &str) -> String {
    let mut text = String::with_capacity(input.len());
    let mut inside_markup = false;

    for current in input.chars() {
        if current == '<' {
            inside_markup = true;
        } else if inside_markup {
            // Skip tag content; only `>` ends the tag.
            if current == '>' {
                inside_markup = false;
            }
        } else {
            text.push(current);
        }
    }

    text
}

/// Decodes a small set of HTML character references, leaving everything else untouched.
///
/// Supported references are `&nbsp;` (decoded to a plain space), `&lt;`, `&gt;`, `&quot;`,
/// `&amp;`, `&apos;`, decimal references (`&#39;`) and hexadecimal references (`&#x27;`).
/// Unknown or malformed references are copied through verbatim.
///
/// A `&` that is followed by another `&` before any `;` is treated as literal text, so a stray
/// ampersand cannot prevent the reference after it from being decoded
/// (`"a & b &amp; c"` becomes `"a & b & c"`).
fn decode_html_entities_lossy(input: &str) -> String {
    /// Parses the digits of a numeric character reference in the given radix.
    fn decode_code_point(digits: &str, radix: u32) -> Option<char> {
        // `from_str_radix` accepts a leading `+`, which is not valid in a character reference,
        // so the digits are validated explicitly first.
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        char::from_u32(u32::from_str_radix(digits, radix).ok()?)
    }

    /// Maps the text between `&` and `;` to the character it stands for, if recognised.
    fn decode_entity(entity: &str) -> Option<char> {
        match entity {
            "nbsp" => Some(' '),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "amp" => Some('&'),
            "apos" => Some('\''),
            _ => {
                let reference = entity.strip_prefix('#')?;
                match reference.strip_prefix('x').or_else(|| reference.strip_prefix('X')) {
                    Some(hex_digits) => decode_code_point(hex_digits, 16),
                    None => decode_code_point(reference, 10),
                }
            }
        }
    }

    let mut out = String::with_capacity(input.len());
    let mut cursor = 0usize;

    while let Some(rel_start) = input[cursor..].find('&') {
        let start = cursor + rel_start;
        out.push_str(&input[cursor..start]);

        // Look for whichever comes first: the terminating `;` or another `&`. Stopping at the
        // next `&` keeps the scan linear and prevents a stray ampersand from swallowing it.
        let body_start = start + 1;
        let Some(rel_stop) = input[body_start..].find(|c: char| c == ';' || c == '&') else {
            out.push_str(&input[start..]);
            return out;
        };
        let stop = body_start + rel_stop;

        if input.as_bytes()[stop] == b';' {
            match decode_entity(&input[body_start..stop]) {
                Some(ch) => out.push(ch),
                None => out.push_str(&input[start..=stop]),
            }
            cursor = stop + 1;
        } else {
            // The current `&` is literal text; resume at the following `&`.
            out.push_str(&input[start..stop]);
            cursor = stop;
        }
    }

    out.push_str(&input[cursor..]);
    out
}

#[cfg(test)]
mod tests {
    use super::sanitize_entry;
    use crate::raymon_core::{Entry, Origin, Payload, Screen};
    use serde_json::json;

    /// Builds a fixed origin for the test payloads.
    fn origin() -> Origin {
        Origin {
            project: "proj".to_string(),
            host: "host".to_string(),
            screen: Some(Screen::new("proj:host:default")),
            session_id: None,
            function_name: None,
            file: None,
            line_number: None,
        }
    }

    /// Builds an entry with a single `log` payload whose content is `{ "values": [value] }`.
    fn entry_with_value(value: &str) -> Entry {
        Entry {
            uuid: "uuid".to_string(),
            received_at: 0,
            project: "proj".to_string(),
            host: "host".to_string(),
            screen: Screen::new("proj:host:default"),
            session_id: None,
            payloads: vec![Payload {
                r#type: "log".to_string(),
                content: json!({ "values": [value] }),
                origin: origin(),
            }],
        }
    }

    /// Returns the string stored at `values[0]` of the entry's first payload.
    fn first_value(entry: &Entry) -> &str {
        entry.payloads[0]
            .content
            .get("values")
            .and_then(|value| value.as_array())
            .and_then(|values| values.first())
            .and_then(|value| value.as_str())
            .expect("first payload should hold a string at values[0]")
    }

    #[test]
    fn sanitizes_symfony_var_dumper_html() {
        let dump = r#"<script> Sfdump = window.Sfdump || (function () {})</script>
<pre class=sf-dump id=sf-dump-1 data-indent-pad="  ">
<span class=sf-dump-note>array:2</span> [<samp>
  <span class=sf-dump-index>0</span> => <span class=sf-dump-num>12</span>
  <span class=sf-dump-index>1</span> => <span class=sf-dump-num>3</span> <span>&#9654;</span>
</samp>]
</pre><script>Sfdump(\"sf-dump-1\")</script>"#;

        let mut entry = entry_with_value(dump);
        sanitize_entry(&mut entry);

        let sanitized = first_value(&entry);

        assert!(sanitized.contains("array:2"));
        assert!(sanitized.contains("0 => 12"));
        assert!(sanitized.contains("1 => 3"));
        // `&#9654;` in the dump is the numeric reference for U+25B6.
        assert!(sanitized.contains('\u{25B6}'));
        assert!(!sanitized.contains("Sfdump = window.Sfdump"));
        assert!(!sanitized.contains("<span"));
        assert!(!sanitized.contains("<script"));
        assert!(!sanitized.contains("sf-dump"));
    }

    #[test]
    fn leaves_other_html_strings_alone() {
        let html = "<b>hi</b>";
        let mut entry = entry_with_value(html);
        sanitize_entry(&mut entry);

        assert_eq!(first_value(&entry), html);
    }

    #[test]
    fn redacts_large_base64_strings() {
        let blob = "A".repeat(super::BLOB_STRING_LEN_THRESHOLD + 8);
        let mut entry = entry_with_value(&blob);
        sanitize_entry(&mut entry);

        assert_eq!(first_value(&entry), super::BLOB_STRING_PLACEHOLDER);
    }

    #[test]
    fn redacts_data_uri_base64_strings() {
        let blob =
            format!("data:image/png;base64,{}", "A".repeat(super::BLOB_STRING_LEN_THRESHOLD + 8));
        let mut entry = entry_with_value(&blob);
        sanitize_entry(&mut entry);

        assert_eq!(first_value(&entry), super::BLOB_STRING_PLACEHOLDER);
    }

    #[test]
    fn keeps_small_base64_strings() {
        let blob = "A".repeat(256);
        let mut entry = entry_with_value(&blob);
        sanitize_entry(&mut entry);

        assert_eq!(first_value(&entry), blob);
    }
}