use regex::Regex;
use std::sync::LazyLock;
/// Matches the two kinds of smos markup that `clean` removes first:
///
/// * an HTML comment of the form `<!-- smos:TOKEN -->`, where `TOKEN` is a
///   run of non-whitespace characters and the padding around it is optional;
/// * a complete `<smos-memory ...>...</smos-memory>` element, attributes and
///   body included. The `(?s)` flag lets `.` cross newlines so multi-line
///   bodies are matched, and `.*?` is lazy so each element ends at its own
///   closing tag.
///
/// The pattern is a fixed literal, so the `expect` can only fire if the
/// literal itself is edited into something invalid.
static MARKERS_RE: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r"<!--\s*smos:\S+?\s*-->|(?s)<smos-memory[^>]*>.*?</smos-memory>";
    Regex::new(pattern).expect("markers regex literal")
});
/// Matches a properly closed `<think>...</think>` block.
///
/// `(?s)` makes `.` match newlines so multi-line bodies are covered; the
/// lazy `.*?` stops at the first `</think>`, so adjacent blocks are matched
/// one at a time instead of being merged into a single span.
static THINK_CLOSED_RE: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r"(?s)<think>.*?</think>";
    Regex::new(pattern).expect("think-closed regex literal")
});
/// Matches an opening `<think>` tag together with everything after it, up to
/// the end of the input (`(?s)` lets `.` span newlines).
///
/// `clean` applies this only after closed blocks have been removed, so what
/// it catches there is a `<think>` that never received its closing tag.
static THINK_OPEN_RE: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r"(?s)<think>.*$";
    Regex::new(pattern).expect("think-open regex literal")
});
/// Matches a bare `sess_<id>` token, where `<id>` is one or more ASCII
/// letters, digits or underscores.
///
/// Capture group 1 holds what precedes the token: either the start of the
/// text (empty) or a single character that is not an ASCII letter, digit or
/// underscore. Requiring that boundary keeps `sess_` embedded in a longer
/// word (for example `obsess_token`) from matching. Because the boundary
/// character is part of the match, a replacement must re-insert group 1 to
/// avoid deleting it.
static BARE_SESS_RE: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r"(^|[^A-Za-z0-9_])sess_[A-Za-z0-9_]+";
    Regex::new(pattern).expect("bare sess regex literal")
});
/// Strips smos bookkeeping noise from `content` and returns the trimmed rest.
///
/// The passes run in a fixed order, each on the output of the previous one:
///
/// 1. `<!-- smos:... -->` comments and whole `<smos-memory>` elements;
/// 2. closed `<think>...</think>` blocks;
/// 3. a dangling `<think>` and everything after it;
/// 4. bare `sess_<id>` tokens. The character in front of the token is put
///    back, and whitespace after it is left alone, so an id removed from the
///    middle of a sentence leaves both of its surrounding spaces behind.
///
/// Finally, leading and trailing whitespace is trimmed. Interior whitespace
/// is never collapsed.
pub fn clean(content: &str) -> String {
    let text = MARKERS_RE.replace_all(content, "");
    let text = THINK_CLOSED_RE.replace_all(&text, "");
    // Must follow the closed-block pass: this one eats to the end of input.
    let text = THINK_OPEN_RE.replace_all(&text, "");
    // `${1}` restores the boundary character captured ahead of the id.
    let text = BARE_SESS_RE.replace_all(&text, "${1}");
    text.trim().to_owned()
}
/// Unit tests for `clean`.
///
/// `clean` never collapses interior whitespace. When a marker or a bare
/// session id is removed from the middle of a sentence, the spaces on both
/// sides of it survive, so the expected strings below contain double spaces
/// at those positions.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn strips_session_marker_comment() {
        let input = "hello\n<!-- smos:sess_abcdef012345 -->";
        assert_eq!(clean(input), "hello");
    }

    #[test]
    fn strips_multiline_smos_memory_block() {
        let input = "before\n<smos-memory session=\"sess_x\">\n[fact_1] doc\n</smos-memory>\nafter";
        let out = clean(input);
        assert!(out.contains("before"));
        assert!(out.contains("after"));
        assert!(!out.contains("smos-memory"));
        assert!(!out.contains("fact_1"));
    }

    #[test]
    fn strips_smos_memory_block_with_attributes() {
        let input = "<smos-memory session=\"sess_y\" extra=\"value\">body</smos-memory>tail";
        let out = clean(input);
        assert_eq!(out, "tail");
    }

    #[test]
    fn strips_bare_session_id_preserving_surrounding_text() {
        let input = "the session id is sess_abcdef012345 here";
        // The space before the id is restored via `${1}` and the space after
        // it is untouched, so two spaces remain where the id used to be.
        assert_eq!(clean(input), "the session id is  here");
    }

    #[test]
    fn preserves_session_id_embedded_in_a_word() {
        let input = "obsess_token must survive";
        assert_eq!(clean(input), "obsess_token must survive");
    }

    #[test]
    fn preserves_normal_content_without_noise() {
        let input = "Just a regular fact about Rust and cargo.";
        assert_eq!(clean(input), input);
    }

    #[test]
    fn empty_input_yields_empty_output() {
        assert_eq!(clean(""), "");
        assert_eq!(clean(" "), "");
    }

    #[test]
    fn strips_bare_id_at_start_of_text() {
        let input = "sess_aabbccddeeff is the id";
        assert_eq!(clean(input), "is the id");
    }

    #[test]
    fn strips_multiple_distinct_noise_patterns_in_one_pass() {
        let input = "marker <!-- smos:sess_1 --> bare sess_aabbccddeeff block <smos-memory session=\"s\">x</smos-memory>";
        let out = clean(input);
        // Removing the comment and the bare id each leaves both neighbouring
        // spaces; only the trailing space is trimmed.
        assert_eq!(out, "marker  bare  block");
    }

    #[test]
    fn strips_closed_think_block_at_start() {
        let input = "<think>let me reason about this</think>The answer is 42.";
        assert_eq!(clean(input), "The answer is 42.");
    }

    #[test]
    fn strips_closed_think_block_in_middle() {
        let input = "Before.<think>internal deliberation</think>After.";
        assert_eq!(clean(input), "Before.After.");
    }

    #[test]
    fn strips_closed_think_block_at_end() {
        let input = "Real fact here.<think>and some trailing rumination</think>";
        assert_eq!(clean(input), "Real fact here.");
    }

    #[test]
    fn strips_multiline_think_block_body() {
        let input = "fact\n<think>line one\nline two\nline three</think>\nmore fact";
        assert_eq!(clean(input), "fact\n\nmore fact");
    }

    #[test]
    fn strips_unclosed_think_to_end_of_string() {
        let input = "answer<think>reasoning that never got a closing tag";
        assert_eq!(clean(input), "answer");
    }

    #[test]
    fn strips_adjacent_closed_think_blocks() {
        let input = "<think>A</think><think>B</think>final";
        assert_eq!(clean(input), "final");
    }

    #[test]
    fn closed_think_stripped_before_unclosed_pass() {
        let input = "<think>closed reasoning</think>fact<think>unclosed trail";
        assert_eq!(clean(input), "fact");
    }

    #[test]
    fn normal_text_without_think_is_unchanged() {
        let input = "The cache uses TTL=60 to avoid stale entries.";
        assert_eq!(clean(input), input);
    }

    #[test]
    fn strips_think_combined_with_markers_and_bare_id() {
        let input = "<think>noise</think>real fact <!-- smos:sess_1 --> sess_aabb <smos-memory session=\"s\">x</smos-memory>";
        let out = clean(input);
        assert_eq!(out, "real fact");
    }
}