/// Returns `true` when `c` falls inside one of the CJK-related Unicode blocks
/// listed below (Chinese ideographs, Japanese kana, Korean Hangul, and the
/// shared punctuation / compatibility blocks).
///
/// Supplementary-plane ideographs (U+20000 and above) are not covered.
fn is_cjk(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'       // CJK Unified Ideographs
            | '\u{3400}'..='\u{4DBF}' // CJK Unified Ideographs Extension A
            | '\u{F900}'..='\u{FAFF}' // CJK Compatibility Ideographs
            | '\u{2E80}'..='\u{2EFF}' // CJK Radicals Supplement
            | '\u{3000}'..='\u{303F}' // CJK Symbols and Punctuation
            | '\u{3040}'..='\u{309F}' // Hiragana
            | '\u{30A0}'..='\u{30FF}' // Katakana
            | '\u{31F0}'..='\u{31FF}' // Katakana Phonetic Extensions
            | '\u{3200}'..='\u{32FF}' // Enclosed CJK Letters and Months
            | '\u{FE30}'..='\u{FE4F}' // CJK Compatibility Forms
            | '\u{AC00}'..='\u{D7AF}' // Hangul Syllables
            | '\u{1100}'..='\u{11FF}' // Hangul Jamo
    )
}
fn has_degenerate_cjk_run(line: &str) -> bool {
let chars: Vec<char> = line.chars().collect();
if chars.is_empty() {
return false;
}
let has_cjk = chars.iter().any(|c| is_cjk(*c));
if !has_cjk {
return false;
}
if is_symbol_flood(line) {
return true;
}
if has_repeated_symbol(line, 5) {
return true;
}
false
}
/// Returns `true` when `line` contains the same symbol character at least
/// `threshold` times in a row.
///
/// A "symbol" is any character that is neither alphanumeric nor whitespace.
/// All Unicode whitespace is excluded (not just `' '`), so tab indentation or
/// runs of ideographic spaces are not mistaken for symbol noise.
///
/// The count is only compared against `threshold` when a run grows, so a
/// lone symbol never matches, even for `threshold <= 1`.
fn has_repeated_symbol(line: &str, threshold: u32) -> bool {
    let mut previous: Option<char> = None;
    let mut run = 1u32;
    for c in line.chars() {
        let is_symbol = !c.is_alphanumeric() && !c.is_whitespace();
        if is_symbol && previous == Some(c) {
            run += 1;
            if run >= threshold {
                return true;
            }
        } else {
            // A different character (or a non-symbol) starts a new run.
            run = 1;
        }
        previous = Some(c);
    }
    false
}
/// Returns `true` when the trimmed `line` contains a run of ten or more
/// identical symbol characters.
///
/// A "symbol" is any character that is neither alphanumeric nor whitespace.
/// All Unicode whitespace is excluded (not just `' '`), so interior runs of
/// tabs — e.g. empty tab-separated columns — are not treated as a flood.
fn is_symbol_flood(line: &str) -> bool {
    const FLOOD_RUN: u32 = 10;

    let trimmed = line.trim();
    // Byte length is a cheap lower bound: fewer than ten bytes cannot hold
    // ten characters, so no qualifying run is possible.
    if trimmed.len() < 10 {
        return false;
    }

    let mut previous: Option<char> = None;
    let mut run = 1u32;
    for c in trimmed.chars() {
        let is_symbol = !c.is_alphanumeric() && !c.is_whitespace();
        if is_symbol && previous == Some(c) {
            run += 1;
            if run >= FLOOD_RUN {
                return true;
            }
        } else {
            run = 1;
        }
        previous = Some(c);
    }
    false
}
/// Removes degenerate lines from `output` and returns the remaining text.
///
/// A line is dropped when `has_degenerate_cjk_run` or `is_symbol_flood`
/// reports it. When nothing is dropped the input is returned unchanged.
///
/// Kept lines retain their original terminators (`\n` or `\r\n`), so a
/// trailing newline in the input survives the removal of other lines. When
/// the input has no trailing newline and its last line is dropped, the
/// result does not end with a newline either.
///
/// Emits a `tracing` debug event with the number of removed lines whenever
/// at least one line was dropped.
pub fn sanitize(output: &str) -> String {
    if output.is_empty() {
        return String::new();
    }

    let mut cleaned = String::with_capacity(output.len());
    let mut removed = 0usize;
    for raw in output.split_inclusive('\n') {
        // Inspect the line without its terminator; like `str::lines`, a `\r`
        // is stripped only when it directly precedes the `\n`.
        let line = match raw.strip_suffix('\n') {
            Some(body) => body.strip_suffix('\r').unwrap_or(body),
            None => raw,
        };
        if has_degenerate_cjk_run(line) || is_symbol_flood(line) {
            removed += 1;
        } else {
            cleaned.push_str(raw);
        }
    }

    if removed == 0 {
        return output.to_string();
    }

    // The input's final line was dropped and the input had no trailing
    // newline: do not leave the previous line's terminator dangling.
    if !output.ends_with('\n') && cleaned.ends_with('\n') {
        cleaned.pop();
        if cleaned.ends_with('\r') {
            cleaned.pop();
        }
    }

    tracing::debug!("[sanitizer] removed {removed} degenerate line(s) from output");
    cleaned
}
/// Replaces a fixed set of Unicode arrows, math signs and status glyphs in
/// `text` with ASCII spellings; every other character is copied unchanged.
///
/// The text is rewritten in a single pass into one output buffer. No
/// replacement string contains a mapped symbol, so the result is the same as
/// applying the substitutions one after another.
pub fn ascii_safe_symbols(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for c in text.chars() {
        let replacement = match c {
            '\u{2192}' => "->",
            '←' => "<-",
            '∴' => ":.",
            '≈' => "~=",
            '≠' => "!=",
            '∈' => "in",
            '∅' => "(none)",
            '⊕' => "+",
            '⊖' => "-",
            'Δ' => "delta",
            '✓' => "ok",
            '✗' => "FAIL",
            '⚠' => "WARN",
            _ => {
                out.push(c);
                continue;
            }
        };
        out.push_str(replacement);
    }
    out
}
// Unit tests for the sanitizer: degenerate-line removal, preservation of
// legitimate CJK / mixed-language content, and ASCII symbol substitution.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn clean_passes_normal_english() {
        let input = "fn main() {\n println!(\"hello\");\n}";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn clean_removes_degenerate_cjk_with_symbol_flood() {
        let input = "Explored 22 files, 14 searches\n肛裂!!!!!!!!!!!!!!!!!!\nExploring >";
        let cleaned = sanitize(input);
        assert!(!cleaned.contains("肛裂"));
        assert!(cleaned.contains("Explored 22"));
        assert!(cleaned.contains("Exploring"));
    }

    #[test]
    fn clean_preserves_genuine_cjk_content() {
        let input = "这是一个正常的中文文档,包含完整的句子结构。";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn clean_preserves_mixed_cjk_english_header() {
        let input = "## 配置说明 (Configuration)";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn clean_preserves_path_with_cjk() {
        let input = "path/to/文件.md";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn clean_preserves_status_message_with_cjk() {
        let input = "Build: 编译完成 ✓";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn clean_preserves_mixed_cjk_english_docs() {
        let input = "The function 関数 is documented in 文档 for reference.";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn clean_preserves_multilingual_paragraph() {
        let input =
            "This module handles 数据处理 (data processing) and 文件管理 (file management).";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn clean_preserves_cjk_in_code_comments() {
        let input = "// 初始化配置 — initialize configuration";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn clean_preserves_korean_mixed_content() {
        let input = "Build status: 빌드 성공 (success)";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn clean_preserves_japanese_mixed_content() {
        let input = "Error in モジュール module: connection timeout";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn clean_removes_symbol_flood() {
        let input = "normal line\n!!!!!!!!!!!!!!!!!!!!!!!\nanother line";
        let cleaned = sanitize(input);
        assert!(!cleaned.contains("!!!!!!!!!!!!"));
        assert!(cleaned.contains("normal line"));
        assert!(cleaned.contains("another line"));
    }

    #[test]
    fn clean_preserves_normal_punctuation() {
        let input = "Error: something failed!!";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn ascii_safe_replaces_unicode_symbols() {
        // The input must actually contain the symbols under test: U+2192
        // (rightwards arrow), U+2713 (check mark) and U+2717 (ballot X).
        // Escapes keep the fixture unambiguous in the source text.
        let out = ascii_safe_symbols("fn \u{2192} result \u{2713} or \u{2717}");
        assert_eq!(out, "fn -> result ok or FAIL");
    }

    #[test]
    fn ascii_safe_replaces_math_symbols() {
        let out = ascii_safe_symbols("A ≠ B, C ≈ D, x ∈ set, ∅");
        assert_eq!(out, "A != B, C ~= D, x in set, (none)");
    }

    #[test]
    fn degenerate_cjk_with_symbol_flood() {
        assert!(has_degenerate_cjk_run("肛裂!!!!!!!!!!"));
    }

    #[test]
    fn degenerate_cjk_with_repeated_symbols() {
        assert!(has_degenerate_cjk_run("乱码!!!!!garbled"));
    }

    #[test]
    fn legitimate_mixed_cjk_not_flagged() {
        assert!(!has_degenerate_cjk_run("result: 乱码输 garbled"));
        assert!(!has_degenerate_cjk_run("## 配置说明 (Configuration)"));
        assert!(!has_degenerate_cjk_run("Build: 编译完成 ✓"));
        assert!(!has_degenerate_cjk_run("path/to/文件.md"));
    }

    #[test]
    fn genuine_cjk_line_not_flagged() {
        assert!(!has_degenerate_cjk_run("这是完整的中文内容,不是乱码"));
    }

    #[test]
    fn short_cjk_pair_not_flagged() {
        assert!(!has_degenerate_cjk_run("the 変数 variable"));
    }

    #[test]
    fn empty_input() {
        assert_eq!(sanitize(""), "");
    }

    #[test]
    fn symbol_flood_exact_threshold() {
        // Nine repeats stay below the flood threshold; ten reach it.
        assert!(!is_symbol_flood("!!!!!!!!!"));
        assert!(is_symbol_flood("!!!!!!!!!!"));
    }

    #[test]
    fn multiline_mixed_cjk_preserved() {
        let input =
            "# 项目文档\nThis is the 配置 section.\n## 安装步骤 (Installation)\nRun: cargo build";
        assert_eq!(sanitize(input), input);
    }

    #[test]
    fn cjk_filename_in_output_preserved() {
        let input = "Modified: src/核心/处理器.rs\nCompiled: 3 files";
        assert_eq!(sanitize(input), input);
    }
}