fn is_cjk(c: char) -> bool {
matches!(c,
'\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}' | '\u{2E80}'..='\u{2EFF}' | '\u{3000}'..='\u{303F}' | '\u{31F0}'..='\u{31FF}' | '\u{3200}'..='\u{32FF}' | '\u{FE30}'..='\u{FE4F}' | '\u{AC00}'..='\u{D7AF}' | '\u{1100}'..='\u{11FF}' )
}
fn has_degenerate_cjk_run(line: &str) -> bool {
let chars: Vec<char> = line.chars().collect();
let total = chars.len();
if total == 0 {
return false;
}
let cjk_count = chars.iter().filter(|c| is_cjk(**c)).count();
if cjk_count == 0 {
return false;
}
let cjk_ratio = cjk_count as f64 / total as f64;
if cjk_ratio > 0.6 {
return false;
}
if cjk_count >= 1 && is_symbol_flood(line) {
return true;
}
if cjk_count >= 1 && has_repeated_symbol(line, 5) {
return true;
}
let mut consecutive = 0u32;
for &c in &chars {
if is_cjk(c) {
consecutive += 1;
if consecutive >= 3 {
return true;
}
} else {
consecutive = 0;
}
}
false
}
fn has_repeated_symbol(line: &str, threshold: u32) -> bool {
let chars: Vec<char> = line.chars().collect();
let mut run = 1u32;
for i in 1..chars.len() {
if chars[i] == chars[i - 1] && !chars[i].is_alphanumeric() && chars[i] != ' ' {
run += 1;
if run >= threshold {
return true;
}
} else {
run = 1;
}
}
false
}
fn is_symbol_flood(line: &str) -> bool {
let trimmed = line.trim();
if trimmed.len() < 10 {
return false;
}
let chars: Vec<char> = trimmed.chars().collect();
let mut max_run = 1u32;
let mut current_run = 1u32;
for i in 1..chars.len() {
if chars[i] == chars[i - 1] && !chars[i].is_alphanumeric() && chars[i] != ' ' {
current_run += 1;
if current_run > max_run {
max_run = current_run;
}
} else {
current_run = 1;
}
}
max_run >= 10
}
pub fn sanitize(output: &str) -> String {
if output.is_empty() {
return output.to_string();
}
let mut cleaned = Vec::new();
let mut removed = 0usize;
for line in output.lines() {
if has_degenerate_cjk_run(line) || is_symbol_flood(line) {
removed += 1;
continue;
}
cleaned.push(line);
}
if removed == 0 {
return output.to_string();
}
let result = cleaned.join("\n");
if removed > 0 {
tracing::debug!("[sanitizer] removed {removed} degenerate line(s) from output");
}
result
}
pub fn ascii_safe_symbols(text: &str) -> String {
text.replace('→', "->")
.replace('←', "<-")
.replace('∴', ":.")
.replace('≈', "~=")
.replace('≠', "!=")
.replace('∈', "in")
.replace('∅', "(none)")
.replace('⊕', "+")
.replace('⊖', "-")
.replace('Δ', "delta")
.replace('✓', "ok")
.replace('✗', "FAIL")
.replace('⚠', "WARN")
.replace('\u{2192}', "->") }
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn clean_passes_normal_english() {
let input = "fn main() {\n println!(\"hello\");\n}";
assert_eq!(sanitize(input), input);
}
#[test]
fn clean_removes_degenerate_cjk_in_english() {
let input = "Explored 22 files, 14 searches\n肛裂!!!!!!!!!!!!!!!!!!\nExploring >";
let cleaned = sanitize(input);
assert!(!cleaned.contains("肛裂"));
assert!(cleaned.contains("Explored 22"));
assert!(cleaned.contains("Exploring"));
}
#[test]
fn clean_preserves_genuine_cjk_content() {
let input = "这是一个正常的中文文档,包含完整的句子结构。";
assert_eq!(sanitize(input), input);
}
#[test]
fn clean_preserves_mixed_cjk_english_docs() {
let input = "The function 関数 is documented in 文档 for reference.";
assert_eq!(sanitize(input), input);
}
#[test]
fn clean_removes_symbol_flood() {
let input = "normal line\n!!!!!!!!!!!!!!!!!!!!!!!\nanother line";
let cleaned = sanitize(input);
assert!(!cleaned.contains("!!!!!!!!!!!!"));
assert!(cleaned.contains("normal line"));
assert!(cleaned.contains("another line"));
}
#[test]
fn clean_preserves_normal_punctuation() {
let input = "Error: something failed!!";
assert_eq!(sanitize(input), input);
}
#[test]
fn ascii_safe_replaces_unicode_symbols() {
let out = ascii_safe_symbols("fn → result ✓ or ✗");
assert_eq!(out, "fn -> result ok or FAIL");
}
#[test]
fn ascii_safe_replaces_math_symbols() {
let out = ascii_safe_symbols("A ≠ B, C ≈ D, x ∈ set, ∅");
assert_eq!(out, "A != B, C ~= D, x in set, (none)");
}
#[test]
fn degenerate_cjk_mixed_with_exclamation() {
assert!(has_degenerate_cjk_run("肛裂!!!!!!"));
assert!(has_degenerate_cjk_run("result: 乱码输 garbled"));
}
#[test]
fn genuine_cjk_line_not_flagged() {
assert!(!has_degenerate_cjk_run("这是完整的中文内容,不是乱码"));
}
#[test]
fn short_cjk_pair_not_flagged() {
assert!(!has_degenerate_cjk_run("the 変数 variable"));
}
#[test]
fn empty_input() {
assert_eq!(sanitize(""), "");
}
#[test]
fn symbol_flood_exact_threshold() {
assert!(!is_symbol_flood("!!!!!!!!!")); assert!(is_symbol_flood("!!!!!!!!!!")); }
}